src/5021000/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103 /* Working state for compiling one pattern; read via the RExC_* macros. */
 104 struct RExC_state_t {
 105     U32         flags;                  /* RXf_* are we folding, multilining? */
 106     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object
 111                                            pprivate field */
 112     char        *start;                 /* Start of input for compile */
 113     char        *end;                   /* End of input for compile */
 114     char        *parse;                 /* Input-scan pointer. */
 115     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 116     regnode     *emit_start;            /* Start of emitted-code area */
 117     regnode     *emit_bound;            /* First regnode outside of the
 118                                            allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 120                                            implies compiling, so don't emit */
 121     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 122                                            large enough for the largest
 123                                            non-EXACTish node, so can use it as
 124                                            scratch in pass1 */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     SSize_t     size;                   /* Code size. */
 129     I32                npar;            /* Capture buffer count, (OPEN) plus
 130                                            one. ("par" 0 is the whole
 131                                            pattern)*/
 132     I32         nestroot;               /* root parens we are in - used by
 133                                            accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151                                            through */
 152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         contains_i;
 156     I32         override_recoding;
 157     I32         in_multi_char_class;
 158     struct reg_code_block *code_blocks; /* positions of literal (?{})
 159                                             within pattern */
 160     int         num_code_blocks;        /* size of code_blocks[] */
 161     int         code_index;             /* next code_blocks[] slot */
 162     SSize_t     maxlen;                        /* minimum possible number of chars in string to match -- NOTE(review): the name says "max"; confirm which is meant */
 163 #ifdef ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177 /* Shorthand for RExC_state_t members; each needs 'pRExC_state' in scope. */
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 190                                                          others */
 191 #endif
 192 #define RExC_emit       (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty    (pRExC_state->naughty)
 197 #define RExC_sawback    (pRExC_state->sawback)
 198 #define RExC_seen       (pRExC_state->seen)
 199 #define RExC_size       (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar       (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen   (pRExC_state->extralen)
 204 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 205 #define RExC_utf8       (pRExC_state->utf8)
 206 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 208 #define RExC_open_parens        (pRExC_state->open_parens)
 209 #define RExC_close_parens       (pRExC_state->close_parens)
 210 #define RExC_opend      (pRExC_state->opend)
 211 #define RExC_paren_names        (pRExC_state->paren_names)
 212 #define RExC_recurse    (pRExC_state->recurse)
 213 #define RExC_recurse_count      (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216                                    (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale    (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223 /* Test for '*', '+', '?' (ISMULT2 also accepts a '{' that regcurly() approves); ISMULT2 takes a pointer and evaluates it several times. */
 224 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s)      (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
 226         (*(s) == '{' && regcurly((s), FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calculate sizes as UTF-8 */
 243
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) /* offset of x from RExC_emit_start, or -1 if x is NULL */
 245
 246 /* Enable the trie-related optimizations when the build setting below is non-zero */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254 /* Bitmap helpers: one bit per paren number, eight parens per U8; results are parenthesized so they nest safely in expressions. */
 255 #define PBYTE(u8str,paren) (((U8*)(u8str))[(paren) >> 3])
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) (PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) (PBYTE(u8str,paren) |= PBITVAL(paren))
 259 #define PAREN_UNSET(u8str,paren) (PBYTE(u8str,paren) &= (~PBITVAL(paren)))
 260 /* If the pattern is not UTF, store RESTART_UTF8 in *flagp and return NULL from the enclosing function ('flagp' must be in scope). */
 261 #define REQUIRE_UTF8    STMT_START {                                       \
 262                                      if (!UTF) {                           \
 263                                          *flagp = RESTART_UTF8;            \
 264                                          return NULL;                      \
 265                                      }                                     \
 266                         } STMT_END
 267
 268 /* Convert between a named class (regcomp.h) and its class number (handy.h).
 269  * Integer division maps named classes 2n and 2n+1 to the same number n. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272 /* Wrappers that always pass TRUE as the third ("maybe complement 2nd") argument. */
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288     /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296      /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303     This is the position the string must appear at, or not before.
 304     It also implicitly (when combined with minlenp) tells us how many
 305     characters must match before the string we are searching for.
 306     Likewise when combined with minlenp and the length of the string it
 307     tells us how many characters must appear after the string we have
 308     found.
 309
 310   - max_offset
 311     Only used for floating strings. This is the rightmost point that
 312     the string can appear at. If set to SSize_t_MAX it indicates that the
 313     string can occur infinitely far to the right.
 314
 315   - minlenp
 316     A pointer to the minimum number of characters of the pattern that the
 317     string was found inside. This is important as in the case of positive
 318     lookahead or positive lookbehind we can have multiple patterns
 319     involved. Consider
 320
 321     /(?=FOO).*F/
 322
 323     The minimum length of the pattern overall is 3, the minimum length
 324     of the lookahead part is 3, but the minimum length of the part that
 325     will actually match is 1. So 'FOO's minimum length is 3, but the
 326     minimum length for the F is 1. This is important as the minimum length
 327     is used to determine offsets in front of and behind the string being
 328     looked for.  Since strings can be composites this is the length of the
 329     pattern at the time it was committed with a scan_commit. Note that
 330     the length is calculated by study_chunk, so that the minimum lengths
 331     are not known until the full pattern has been compiled, thus the
 332     pointer to the value.
 333
 334   - lookbehind
 335
 336     In the case of lookbehind the string being searched for can be
 337     offset past the start point of the final matching string.
 338     If this value was just blithely removed from the min_offset it would
 339     invalidate some of the calculations for how many chars must match
 340     before or after (as they are derived from min_offset and minlen and
 341     the length of the string being searched for).
 342     When the final pattern is compiled and the data is moved from the
 343     scan_data_t structure into the regexp structure the information
 344     about lookbehind is factored in, with the information that would
 345     have been lost precalculated in the end_shift field for the
 346     associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354     /*I32 len_min;      unused */
 355     /*I32 len_delta;    unused */
 356     SSize_t pos_min;
 357     SSize_t pos_delta;
 358     SV *last_found;
 359     SSize_t last_end;       /* min value, <0 unless valid. */
 360     SSize_t last_start_min;
 361     SSize_t last_start_max;
 362     SV **longest;           /* Either &l_fixed, or &l_float (presumably longest_fixed / longest_float below). */
 363     SV *longest_fixed;      /* longest fixed string found in pattern */
 364     SSize_t offset_fixed;   /* offset where it starts */
 365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366     I32 lookbehind_fixed;   /* is the position of the string modified by LB */
 367     SV *longest_float;      /* longest floating string found in pattern */
 368     SSize_t offset_float_min; /* earliest point in string it can appear */
 369     SSize_t offset_float_max; /* latest point in string it can appear */
 370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372     I32 flags;
 373     I32 whilem_c;
 374     SSize_t *last_closep;
 375     regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z] */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399 /* All-zero scan_data_t; the positional list must stay in step with the struct's members. */
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402 /* Bit flags; the SF_FIX_ and SF_FL_ variants are SF_BEFORE_* shifted left by 2 and 4 bits. */
 403 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL          0x0001
 405 #define SF_BEFORE_MEOL          0x0002
 406 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408
 409 #define SF_FIX_SHIFT_EOL        (+2)
 410 #define SF_FL_SHIFT_EOL         (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
 413 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
 414
 415 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
 416 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF               0x0040
 418 #define SF_HAS_PAR              0x0080
 419 #define SF_IN_PAR               0x0100
 420 #define SF_HAS_EVAL             0x0200
 421 #define SCF_DO_SUBSTR           0x0400
 422 #define SCF_DO_STCLASS_AND      0x0800
 423 #define SCF_DO_STCLASS_OR       0x1000
 424 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS  0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* The enums for all these are ordered so things work out correctly */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436                                                      == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439                                                      >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453
 454 #define OOB_NAMEDCLASS          -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * looked at. */
 459 #define OOB_UNICODE             0xDEADBEEF
 460 /* UTF-8-aware length of an SV and distance between two pointers; plain SvCUR / pointer difference when not UTF. */
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
 463
 464
 465 /* length of regex to show in messages that don't mark a position within */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475 /* Message tail, and its matching arguments, that print the pattern with MARKER2 inserted 'offset' bytes in. */
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480                 UTF8fARG(UTF, offset, RExC_precomp), \
 481                 UTF8fARG(UTF, RExC_end - RExC_precomp - (offset), RExC_precomp + (offset))
 482
 483 /*
 484  * Calls SAVEFREESV if needed, then calls Perl_croak with the given
 485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 486  * "...".
 487  */
 488 #define _FAIL(code) STMT_START {                                        \
 489     const char *ellipses = "";                                          \
 490     IV len = RExC_end - RExC_precomp;                                   \
 491                                                                         \
 492     if (!SIZE_ONLY)                                                     \
 493         SAVEFREESV(RExC_rx_sv);                                         \
 494     if (len > RegexLengthToShowInErrorMessages) {                       \
 495         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 496         len = RegexLengthToShowInErrorMessages - 10;                    \
 497         ellipses = "...";                                               \
 498     }                                                                   \
 499     code;                                                               \
 500 } STMT_END
 501
 502 #define FAIL(msg) _FAIL(                            \
 503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 504             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505
 506 #define FAIL2(msg,arg) _FAIL(                       \
 507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 508             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 512  */
 513 #define Simple_vFAIL(m) STMT_START {                                    \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 516             m, REPORT_LOCATION_ARGS(offset));   \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEFREESV if needed, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {                           \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL(m);                                    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 532     const IV offset = RExC_parse - RExC_precomp;                        \
 533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 534                       REPORT_LOCATION_ARGS(offset));    \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEFREESV if needed, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {                       \
 541     if (!SIZE_ONLY)                                     \
 542         SAVEFREESV(RExC_rx_sv);                         \
 543     Simple_vFAIL2(m, a1);                               \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 553             REPORT_LOCATION_ARGS(offset));      \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEFREESV if needed, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {                    \
 560     if (!SIZE_ONLY)                                     \
 561         SAVEFREESV(RExC_rx_sv);                         \
 562     Simple_vFAIL3(m, a1, a2);                           \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 569     const IV offset = RExC_parse - RExC_precomp;                \
 570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 571             REPORT_LOCATION_ARGS(offset));      \
 572 } STMT_END
 573
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL4(m, a1, a2, a3);                       \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582     const IV offset = RExC_parse - RExC_precomp;   \
 583     if (!SIZE_ONLY)                                \
 584         SAVEFREESV(RExC_rx_sv);                    \
 585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586             REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589
 590 /* m is not necessarily a "literal string", in this macro */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594             m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {                                   \
 598     const IV offset = loc - RExC_precomp;                               \
 599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 600             REPORT_LOCATION_ARGS(offset));              \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {                                  \
 604     const IV offset = loc - RExC_precomp;                               \
 605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 606             REPORT_LOCATION_ARGS(offset));              \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {                                   \
 610     const IV offset = loc - RExC_precomp;                               \
 611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 612             m REPORT_LOCATION,                                          \
 613             REPORT_LOCATION_ARGS(offset));              \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {                                \
 617     const IV offset = loc - RExC_precomp;                               \
 618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 619             m REPORT_LOCATION,                                          \
 620             REPORT_LOCATION_ARGS(offset));              \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 626             m REPORT_LOCATION,                                          \
 627             a1, REPORT_LOCATION_ARGS(offset));  \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 631     const IV offset = loc - RExC_precomp;                               \
 632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 633             a1, REPORT_LOCATION_ARGS(offset));  \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 637     const IV offset = loc - RExC_precomp;                               \
 638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 639             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 643     const IV offset = loc - RExC_precomp;                               \
 644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 645             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 649     const IV offset = loc - RExC_precomp;                               \
 650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 651             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 655     const IV offset = loc - RExC_precomp;                               \
 656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 657             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 661     const IV offset = loc - RExC_precomp;                               \
 662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 663             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* Allow for side effects in s */
 668 #define REGC(c,s) STMT_START {                  \
 669     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 694     if (! SIZE_ONLY) {                                                  \
 695         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 696                     __LINE__, (int)(node), (int)(byte)));               \
 697         if((node) < 0) {                                                \
 698             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699                                          (int)(node));                  \
 700         } else {                                                        \
 701             RExC_offsets[2*(node)-1] = (byte);                          \
 702         }                                                               \
 703     }                                                                   \
 704 } STMT_END
 705
 706 #define Set_Node_Offset(node,byte) \
 707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
 710 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 711     if (! SIZE_ONLY) {                                                  \
 712         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 713                 __LINE__, (int)(node), (int)(len)));                    \
 714         if((node) < 0) {                                                \
 715             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716                                          (int)(node));                  \
 717         } else {                                                        \
 718             RExC_offsets[2*(node)] = (len);                             \
 719         }                                                               \
 720     }                                                                   \
 721 } STMT_END
 722
 723 #define Set_Node_Length(node,len) \
 724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726     Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 735 } STMT_END
 736 #endif
 737
/* EXPERIMENTAL_INPLACESCAN is defined only when the build sets
 * PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS to a non-zero value.
 * NOTE(review): the code this symbol enables is not visible here. */
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#define EXPERIMENTAL_INPLACESCAN
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
 742 #define DEBUG_RExC_seen() \
 743         DEBUG_OPTIMISE_MORE_r({                                             \
 744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                                                                             \
 746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                                                                             \
 749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                                                                             \
 752             if (RExC_seen & REG_GPOS_SEEN)                                  \
 753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                                                                             \
 755             if (RExC_seen & REG_CANY_SEEN)                                  \
 756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                                                                             \
 758             if (RExC_seen & REG_RECURSE_SEEN)                               \
 759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                                                                             \
 761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                                                                             \
 764             if (RExC_seen & REG_VERBARG_SEEN)                               \
 765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                                                                             \
 767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                                                                             \
 770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                                                                             \
 773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                                                                             \
 776             if (RExC_seen & REG_GOSTART_SEEN)                               \
 777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                                                                             \
 779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                                                                             \
 782             PerlIO_printf(Perl_debug_log,"\n");                             \
 783         });
 784
/* Inside DEBUG_OPTIMISE_MORE_r, and only when 'data' is non-NULL, dump the
 * scan data structure 'data' to Perl_debug_log on a single line:
 *   - 'str', then pos_min/pos_delta, flags, whilem_c, and the value that
 *     last_closep points to (-1 when that pointer is NULL);
 *   - if last_found is set, the last_found string with last_end and
 *     last_start_min/max, followed by the fixed and floating strings and
 *     their offsets.  A "*" marks whichever of the two 'longest' currently
 *     points at.
 * 'str' is written directly into the format, so it must be a string literal.
 * The line is indented by 2*depth spaces.
 * NOTE: the expansion also reads a variable named 'is_inf' that is NOT a
 * macro parameter; one has to be in scope wherever this macro is used.
 * The expansion ends in ';'. */
#define DEBUG_STUDYDATA(str,data,depth)                              \
DEBUG_OPTIMISE_MORE_r(if(data){                                      \
    PerlIO_printf(Perl_debug_log,                                    \
        "%*s" str "Pos:%"IVdf"/%"IVdf                                \
        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
        (int)(depth)*2, "",                                          \
        (IV)((data)->pos_min),                                       \
        (IV)((data)->pos_delta),                                     \
        (UV)((data)->flags),                                         \
        (IV)((data)->whilem_c),                                      \
        (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
        is_inf ? "INF " : ""                                         \
    );                                                               \
    if ((data)->last_found)                                          \
        PerlIO_printf(Perl_debug_log,                                \
            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
            " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
            SvPVX_const((data)->last_found),                         \
            (IV)((data)->last_end),                                  \
            (IV)((data)->last_start_min),                            \
            (IV)((data)->last_start_max),                            \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
            SvPVX_const((data)->longest_fixed),                      \
            (IV)((data)->offset_fixed),                              \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_float)) ? "*" : "",  \
            SvPVX_const((data)->longest_float),                      \
            (IV)((data)->offset_float_min),                          \
            (IV)((data)->offset_float_max)                           \
        );                                                           \
    PerlIO_printf(Perl_debug_log,"\n");                              \
});
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825                     SSize_t *minlenp, int is_inf)
 826 {
 827     const STRLEN l = CHR_SVLEN(data->last_found);
 828     const STRLEN old_l = CHR_SVLEN(*data->longest);
 829     GET_RE_DEBUG_FLAGS_DECL;
 830
 831     PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
 833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834         SvSetMagicSV(*data->longest, data->last_found);
 835         if (*data->longest == data->longest_fixed) {
 836             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 837             if (data->flags & SF_BEFORE_EOL)
 838                 data->flags
 839                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840             else
 841                 data->flags &= ~SF_FIX_BEFORE_EOL;
 842             data->minlen_fixed=minlenp;
 843             data->lookbehind_fixed=0;
 844         }
 845         else { /* *data->longest == data->longest_float */
 846             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 847             data->offset_float_max = (l
 848                                       ? data->last_start_max
 849                                       : (data->pos_delta == SSize_t_MAX
 850                                          ? SSize_t_MAX
 851                                          : data->pos_min + data->pos_delta));
 852             if (is_inf
 853                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854                 data->offset_float_max = SSize_t_MAX;
 855             if (data->flags & SF_BEFORE_EOL)
 856                 data->flags
 857                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858             else
 859                 data->flags &= ~SF_FL_BEFORE_EOL;
 860             data->minlen_float=minlenp;
 861             data->lookbehind_float=0;
 862         }
 863     }
 864     SvCUR_set(data->last_found, 0);
 865     {
 866         SV * const sv = data->last_found;
 867         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869             if (mg)
 870                 mg->mg_len = 0;
 871         }
 872     }
 873     data->last_end = -1;
 874     data->flags &= ~SF_BEFORE_EOL;
 875     DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884     /* Set the SSC 'ssc' to match an empty string or any code point */
 885
 886     PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901      * in any way, so there's no point in using it */
 902
 903     UV start, end;
 904     bool ret;
 905
 906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908     assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911         return FALSE;
 912     }
 913
 914     /* See if the list consists solely of the range 0 - Infinity */
 915     invlist_iterinit(ssc->invlist);
 916     ret = invlist_iternext(ssc->invlist, &start, &end)
 917           && start == 0
 918           && end == UV_MAX;
 919
 920     invlist_iterfinish(ssc->invlist);
 921
 922     if (ret) {
 923         return TRUE;
 924     }
 925
 926     /* If e.g., both \w and \W are set, matches everything */
 927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928         int i;
 929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931                 return TRUE;
 932             }
 933         }
 934     }
 935
 936     return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943      * string, any code point, or any posix class under locale */
 944
 945     PERL_ARGS_ASSERT_SSC_INIT;
 946
 947     Zero(ssc, 1, regnode_ssc);
 948     set_ANYOF_SYNTHETIC(ssc);
 949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950     ssc_anything(ssc);
 951
 952     /* If any portion of the regex is to operate under locale rules,
 953      * initialization includes it.  The reason this isn't done for all regexes
 954      * is that the optimizer was written under the assumption that locale was
 955      * all-or-nothing.  Given the complexity and lack of documentation in the
 956      * optimizer, and that there are inadequate test cases for locale, many
 957      * parts of it may not work properly, it is safest to avoid locale unless
 958      * necessary. */
 959     if (RExC_contains_locale) {
 960         ANYOF_POSIXL_SETALL(ssc);
 961     }
 962     else {
 963         ANYOF_POSIXL_ZERO(ssc);
 964     }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969                               const regnode_ssc *ssc)
 970 {
 971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972      * to the list of code points matched, and locale posix classes; hence does
 973      * not check its flags) */
 974
 975     UV start, end;
 976     bool ret;
 977
 978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980     assert(is_ANYOF_SYNTHETIC(ssc));
 981
 982     invlist_iterinit(ssc->invlist);
 983     ret = invlist_iternext(ssc->invlist, &start, &end)
 984           && start == 0
 985           && end == UV_MAX;
 986
 987     invlist_iterfinish(ssc->invlist);
 988
 989     if (! ret) {
 990         return FALSE;
 991     }
 992
 993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994         return FALSE;
 995     }
 996
 997     return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility. */
1009
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018     /* Look at the data structure created by S_set_ANYOF_arg() */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040
1041         /* Get the code points valid only under UTF-8 locales */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063
1064     /* Add in the points from the bit map */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102
1103     return invlist;
1104 }
1105
/* For now ssc_init_zero() is simply another name for ssc_init() */
#define ssc_init_zero           ssc_init

/* Both of these are ssc_add_range() calls: the first adds the single code
 * point 'cp' (note that 'cp' appears twice in the expansion); the second
 * adds the whole range 0 through UV_MAX */
#define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
#define ssc_match_all_cp(ssc) ssc_add_range((ssc), 0, UV_MAX)
1111
/* 'AND' a given class with another one.  Can create false positives.  'ssc'
 * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */

STATIC void
S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
                const regnode_charclass *and_with)
{
    /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
     * another SSC or a regular ANYOF class.  Can create false positives.
     *
     * 'pRExC_state' is handed on to get_ANYOF_cp_list_for_ssc() and
     * ssc_is_cp_posixl_init(); 'ssc' is modified in place; 'and_with' is
     * only read. */

    SV* anded_cp_list;
    U8  anded_flags;

    PERL_ARGS_ASSERT_SSC_AND;

    assert(is_ANYOF_SYNTHETIC(ssc));

    /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
     * the code point inversion list and just the relevant flags */
    if (is_ANYOF_SYNTHETIC(and_with)) {
        anded_cp_list = ((regnode_ssc *)and_with)->invlist;
        anded_flags = ANYOF_FLAGS(and_with);

        /* XXX This is a kludge around what appears to be deficiencies in the
         * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
         * there are paths through the optimizer where it doesn't get weeded
         * out when it should.  And if we don't make some extra provision for
         * it like the code just below, it doesn't get added when it should.
         * This solution is to add it only when AND'ing, which is here, and
         * only when what is being AND'ed is the pristine, original node
         * matching anything.  Thus it is like adding it to ssc_anything() but
         * only when the result is to be AND'ed.  Probably the same solution
         * could be adopted for the same problem we have with /l matching,
         * which is solved differently in S_ssc_init(), and that would lead to
         * fewer false positives than that solution has.  But if this solution
         * creates bugs, the consequences are only that a warning isn't raised
         * that should be; while the consequences for having /l bugs is
         * incorrect matches */
        if (ssc_is_anything((regnode_ssc *)and_with)) {
            anded_flags |= ANYOF_WARN_SUPER;
        }
    }
    else {
        /* For a plain ANYOF node this list already has any inversion
         * applied */
        anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
        anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
    }

    ANYOF_FLAGS(ssc) &= anded_flags;

    /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
     * C2 is the list of code points in 'and-with'; P2, its posix classes.
     * 'and_with' may be inverted.  When not inverted, we have the situation of
     * computing:
     *  (C1 | P1) & (C2 | P2)
     *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
     *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
     *                    <=  ((C1 & C2) |       P2 ) | ( P1       | (P1 & P2))
     *                    <=  ((C1 & C2) | P1 | P2)
     * Alternatively, the last few steps could be:
     *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
     *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
     *                    <=  (C1 | C2 | (P1 & P2))
     * We favor the second approach if either P1 or P2 is non-empty.  This is
     * because these components are a barrier to doing optimizations, as what
     * they match cannot be known until the moment of matching as they are
     * dependent on the current locale, 'AND'ing them likely will reduce or
     * eliminate them.
     * But we can do better if we know that C1,P1 are in their initial state (a
     * frequent occurrence), each matching everything:
     *  (<everything>) & (C2 | P2) =  C2 | P2
     * Similarly, if C2,P2 are in their initial state (again a frequent
     * occurrence), the result is a no-op
     *  (C1 | P1) & (<everything>) =  C1 | P1
     *
     * Inverted, we have
     *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
     *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
     *                         <=  (C1 & ~C2) | (P1 & ~P2)
     * */

    if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
        && ! is_ANYOF_SYNTHETIC(and_with))
    {
        unsigned int i;

        ssc_intersection(ssc,
                         anded_cp_list,
                         FALSE /* Has already been inverted */
                         );

        /* If either P1 or P2 is empty, the intersection will be also; can skip
         * the loop */
        if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
            ANYOF_POSIXL_ZERO(ssc);
        }
        else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {

            /* Note that the Posix class component P from 'and_with' actually
             * looks like:
             *      P = Pa | Pb | ... | Pn
             * where each component is one posix class, such as in [\w\s].
             * Thus
             *      ~P = ~(Pa | Pb | ... | Pn)
             *         = ~Pa & ~Pb & ... & ~Pn
             *        <= ~Pa | ~Pb | ... | ~Pn
             * The last is something we can easily calculate, but unfortunately
             * is likely to have many false positives.  We could do better
             * in some (but certainly not all) instances if two classes in
             * P have known relationships.  For example
             *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
             * So
             *      :lower: & :print: = :lower:
             * And similarly for classes that must be disjoint.  For example,
             * since \s and \w can have no elements in common based on rules in
             * the POSIX standard,
             *      \w & ^\S = nothing
             * Unfortunately, some vendor locales do not meet the Posix
             * standard, in particular almost everything by Microsoft.
             * The loop below just changes e.g., \w into \W and vice versa */

            regnode_charclass_posixl temp;
            int add = 1;    /* To calculate the index of the complement */

            ANYOF_POSIXL_ZERO(&temp);
            for (i = 0; i < ANYOF_MAX; i++) {
                /* For each even index, the class and the complement that
                 * follows it are not both expected to be set in 'and_with' */
                assert(i % 2 != 0
                       || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
                       || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));

                if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
                    ANYOF_POSIXL_SET(&temp, i + add);
                }
                add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
            }
            ANYOF_POSIXL_AND(&temp, ssc);

        } /* else ssc already has no posixes */
    } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
         in its initial state */
    else if (! is_ANYOF_SYNTHETIC(and_with)
             || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
    {
        /* But if 'ssc' is in its initial state, the result is just 'and_with';
         * copy it over 'ssc' */
        if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
            if (is_ANYOF_SYNTHETIC(and_with)) {
                StructCopy(and_with, ssc, regnode_ssc);
            }
            else {
                /* A plain ANYOF node: take its code point list, and its
                 * posix classes only if it has any */
                ssc->invlist = anded_cp_list;
                ANYOF_POSIXL_ZERO(ssc);
                if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
                    ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                }
            }
        }
        else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
                 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
        {
            /* One or the other of P1, P2 is non-empty.  This is the
             * C1 | C2 | (P1 & P2) form described above */
            if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
                ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
            }
            ssc_union(ssc, anded_cp_list, FALSE);
        }
        else { /* P1 = P2 = empty */
            ssc_intersection(ssc, anded_cp_list, FALSE);
        }
    }
}
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294
1295     PERL_ARGS_ASSERT_SSC_OR;
1296
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399     SV* cp_list = _new_invlist(2);
1400
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431
1432     SV* invlist = invlist_clone(ssc->invlist);
1433
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
/* Accessors for the per-state transition list used while a trie is in list
 * form.  They expect a variable named 'trie' to be in scope.  Element 0 of a
 * list is a header: its .forid is the index of the next free slot and its
 * .newstate is the allocated length (see TRIE_LIST_NEW / TRIE_LIST_PUSH).
 *
 * TRIE_LIST_USED(idx) is the number of transitions stored for state 'idx',
 * or 0 when that state has no list.  It tests the list of the state it was
 * asked about, rather than that of whatever variable named 'state' happens
 * to be in the caller's scope. */
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
#define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list           \
                               ? (TRIE_LIST_CUR( idx ) - 1)           \
                               : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489             AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504         SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509                             PL_colors[0], PL_colors[1],
1510                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511                             PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524         const U32 base = trie->states[ state ].trans.base;
1525
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538         if ( base ) {
1539             U32 ofs = 0;
1540
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564
1565             PerlIO_printf( Perl_debug_log, "]");
1566
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574             (int)word, (int)(trie->wordinfo[word].prev),
1575             (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588                          U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615             SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617             if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647                           U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716                May be the same as startbranch
1717   last       : Thing following the last branch.
1718                May be the same as tail.
1719   tail       : item following the branch sequence
1720   word_count : number of words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from each accepting state to the word that
1753 was matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occurred in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesn't print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795           <ac>
1796           <ad>
1797           <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?:foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817         <foo>
1818         <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827     *(d++) = uv;
1828 */
1829
/* Append to 'revcharmap' a new SV holding the character 'val': its UTF-8
 * encoding (with the UTF8 flag on) when the pattern is UTF, otherwise the
 * single byte (char)val.  Expects 'revcharmap' to be in scope. */
#define TRIE_STORE_REVCHAR(val)                                            \
    STMT_START {                                                           \
        if (UTF) {                                                         \
            /* Sized for the longest sequence uvchr_to_utf8() can emit,   \
             * so that very large code points cannot overrun the buffer */ \
            SV *zlopp = newSV(UTF8_MAXBYTES);                              \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, (val)); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
            av_push(revcharmap, zlopp);                                    \
        } else {                                                           \
            char ooooff = (char)(val);                                     \
            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
        }                                                                  \
    } STMT_END
1845
/* Reads the next character of the word being scanned (at 'uc') into 'uvc',
 * stores its length in bytes in 'len' and counts it in 'wordlen'.  The
 * character is folded here if it has not been already. */
#define TRIE_READ_CHAR STMT_START {                                           \
    wordlen++;                                                                \
    if ( ! UTF ) {                                                            \
        len = 1;                                                              \
        if (folder != PL_fold_latin1) {                                       \
            /* raw data, will be folded later if needed */                    \
            uvc = (U32)*uc;                                                   \
        }                                                                     \
        else {                                                                \
            /* This folder implies Unicode rules.  For characters that can    \
             * be expressed without UTF the fold is the lower case, with two  \
             * exceptions; one of them should already have been dealt with    \
             * before this is called, the other is handled here */            \
            assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                        \
            uvc = toLOWER_L1(*uc);                                            \
            if (UNLIKELY(uvc == MICRO_SIGN)) {                                \
                uvc = GREEK_SMALL_LETTER_MU;                                  \
            }                                                                 \
        }                                                                     \
    }                                                                         \
    else {                                                                    \
        /* UTF input is either already folded or does not need folding */     \
        uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
    }                                                                         \
} STMT_END
1869
1870
1871
/* Appends the transition (fid -> ns) to the list of 'state'.  When the list
 * is full its allocated length is doubled first. */
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
    if ( TRIE_LIST_LEN( state ) <= TRIE_LIST_CUR( state ) ) {   \
        U32 ging;                                               \
        TRIE_LIST_LEN( state ) *= 2;                            \
        ging = TRIE_LIST_LEN( state );                          \
        Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
    }                                                           \
    {                                                           \
        reg_trie_trans_le * const gong =                        \
                &TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ); \
        gong->forid = fid;                                      \
        gong->newstate = ns;                                    \
    }                                                           \
    TRIE_LIST_CUR( state )++;                                   \
} STMT_END

/* Gives 'state' a fresh, zeroed list of 4 elements.  Element 0 is the header,
 * so the first free slot is number 1. */
#define TRIE_LIST_NEW(state) STMT_START {                       \
    Newxz( trie->states[ state ].trans.list,                    \
        4, reg_trie_trans_le );                                 \
    TRIE_LIST_LEN( state ) = 4;                                 \
    TRIE_LIST_CUR( state ) = 1;                                 \
} STMT_END
1888
/* Registers the word that has just been scanned as ending at 'state': bumps
 * 'curword', fills in its wordinfo[] entry, records a jump target when the
 * node after 'noper' lies before 'tail', and either marks 'state' as
 * accepting this word or, if 'state' already accepts an earlier word, links
 * this one into that word's .prev chain.  Relies on many variables of the
 * expanding function being in scope. */
#define TRIE_HANDLE_WORD(state) STMT_START {                    \
    U16 same_word = trie->states[ state ].wordnum;              \
    regnode * const after_noper = regnext( noper );             \
                                                                \
    DEBUG_r({                                                   \
        /* keep a copy of the word for dumping */               \
        SV * const word_sv = (OP(noper) != NOTHING)             \
            ? newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF) \
            : newSVpvn_utf8( "", 0, UTF );                      \
        av_push( trie_words, word_sv );                         \
    });                                                         \
                                                                \
    ++curword;                                                  \
    trie->wordinfo[curword].accept = state;                     \
    trie->wordinfo[curword].len    = wordlen;                   \
    trie->wordinfo[curword].prev   = 0;                         \
                                                                \
    if ( after_noper < tail ) {                                 \
        if (!trie->jump)                                        \
            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
                                                 sizeof(U16) ); \
        trie->jump[curword] = (U16)(after_noper - convert);     \
        if (!jumper)                                            \
            jumper = after_noper;                               \
        if (!nextbranch)                                        \
            nextbranch= regnext(cur);                           \
    }                                                           \
                                                                \
    if ( ! same_word ) {                                        \
        /* first time this word is seen: 'state' now accepts it */ \
        trie->states[ state ].wordnum = curword;                \
    } else {                                                    \
        /* A duplicate.  Insert it into the wordinfo[].prev   */\
        /* chain now, so that when the pieces of chain are    */\
        /* linked together later the duplicates are included  */\
        trie->wordinfo[curword].prev = trie->wordinfo[same_word].prev; \
        trie->wordinfo[same_word].prev = curword;               \
    }                                                           \
} STMT_END
1930
1931
/* Looks up, in the compressed transition table, the state reached from
 * 'state' (whose table offset is 'base') on character id 'charid';
 * 'ucharcount' is the number of character columns.  Yields the stored .next
 * value when base + charid falls in [ucharcount, ubound), the entry's .check
 * equals 'state' and its .next is non-zero.  Otherwise yields 'special' if
 * 'state' is 1, else 0.  Expects 'trie' and 'ubound' to be in scope.
 *
 * Every argument is parenthesized so that expression arguments associate as
 * the caller intended; note that arguments are still evaluated more than
 * once. */
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)               \
     ( ( (base) + (charid) >= (ucharcount)                                   \
         && (base) + (charid) < ubound                                       \
         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
         && trie->trans[ (base) - (ucharcount) + (charid) ].next )           \
           ? trie->trans[ (base) - (ucharcount) + (charid) ].next            \
           : ( (state) == 1 ? (special) : 0 )                                \
      )

/* Distinct bit values; presumably the result codes of make_trie() (its
 * return statements are outside this view -- confirm against the function) */
#define MADE_TRIE       1
#define MADE_JUMP_TRIE  2
#define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     if (!SvIOK(re_trie_maxbuff)) {
2011         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012     }
2013     DEBUG_TRIE_COMPILE_r({
2014         PerlIO_printf( Perl_debug_log,
2015           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016           (int)depth * 2 + 2, "",
2017           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019     });
2020
2021    /* Find the node we are going to overwrite */
2022     if ( first == startbranch && OP( last ) != BRANCH ) {
2023         /* whole branch chain */
2024         convert = first;
2025     } else {
2026         /* branch sub-chain */
2027         convert = NEXTOPER( first );
2028     }
2029
2030     /*  -- First loop and Setup --
2031
2032        We first traverse the branches and scan each word to determine if it
2033        contains widechars, and how many unique chars there are, this is
2034        important as we have to build a table with at least as many columns as we
2035        have unique chars.
2036
2037        We use an array of integers to represent the character codes 0..255
2038        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039        the native representation of the character value as the key and IV's for
2040        the coded index.
2041
2042        *TODO* If we keep track of how many times each character is used we can
2043        remap the columns so that the table compression later on is more
2044        efficient in terms of memory by ensuring the most common value is in the
2045        middle and the least common are on the outside.  IMO this would be better
2046        than a most to least common mapping as there's a decent chance the most
2047        common letter will share a node with the least common, meaning the node
2048        will not be compressible. With a middle is most common approach the worst
2049        case is when we have the least common nodes twice.
2050
2051      */
2052
2053     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054         regnode *noper = NEXTOPER( cur );
2055         const U8 *uc = (U8*)STRING( noper );
2056         const U8 *e  = uc + STR_LEN( noper );
2057         int foldlen = 0;
2058         U32 wordlen      = 0;         /* required init */
2059         STRLEN minchars = 0;
2060         STRLEN maxchars = 0;
2061         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062                                                bitmap?*/
2063
2064         if (OP(noper) == NOTHING) {
2065             regnode *noper_next= regnext(noper);
2066             if (noper_next != tail && OP(noper_next) == flags) {
2067                 noper = noper_next;
2068                 uc= (U8*)STRING(noper);
2069                 e= uc + STR_LEN(noper);
2070                 trie->minlen= STR_LEN(noper);
2071             } else {
2072                 trie->minlen= 0;
2073                 continue;
2074             }
2075         }
2076
2077         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079                                           regardless of encoding */
2080             if (OP( noper ) == EXACTFU_SS) {
2081                 /* false positives are ok, so just set this */
2082                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083             }
2084         }
2085         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086                                            branch */
2087             TRIE_CHARCOUNT(trie)++;
2088             TRIE_READ_CHAR;
2089
2090             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091              * is in effect.  Under /i, this character can match itself, or
2092              * anything that folds to it.  If not under /i, it can match just
2093              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094              * all fold to k, and all are single characters.   But some folds
2095              * expand to more than one character, so for example LATIN SMALL
2096              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097              * the string beginning at 'uc' is 'ffi', it could be matched by
2098              * three characters, or just by the one ligature character. (It
2099              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102              * match.)  The trie needs to know the minimum and maximum number
2103              * of characters that could match so that it can use size alone to
2104              * quickly reject many match attempts.  The max is simple: it is
2105              * the number of folded characters in this branch (since a fold is
2106              * never shorter than what folds to it). */
2107
2108             maxchars++;
2109
2110             /* And the min is equal to the max if not under /i (indicated by
2111              * 'folder' being NULL), or there are no multi-character folds.  If
2112              * there is a multi-character fold, the min is incremented just
2113              * once, for the character that folds to the sequence.  Each
2114              * character in the sequence needs to be added to the list below of
2115              * characters in the trie, but we count only the first towards the
2116              * min number of characters needed.  This is done through the
2117              * variable 'foldlen', which is returned by the macros that look
2118              * for these sequences as the number of bytes the sequence
2119              * occupies.  Each time through the loop, we decrement 'foldlen' by
2120              * how many bytes the current char occupies.  Only when it reaches
2121              * 0 do we increment 'minchars' or look for another multi-character
2122              * sequence. */
2123             if (folder == NULL) {
2124                 minchars++;
2125             }
2126             else if (foldlen > 0) {
2127                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128             }
2129             else {
2130                 minchars++;
2131
2132                 /* See if *uc is the beginning of a multi-character fold.  If
2133                  * so, we decrement the length remaining to look at, to account
2134                  * for the current character this iteration.  (We can use 'uc'
2135                  * instead of the fold returned by TRIE_READ_CHAR because for
2136                  * non-UTF, the latin1_safe macro is smart enough to account
2137                  * for all the unfolded characters, and because for UTF, the
2138                  * string will already have been folded earlier in the
2139                  * compilation process.) */
2140                 if (UTF) {
2141                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142                         foldlen -= UTF8SKIP(uc);
2143                     }
2144                 }
2145                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146                     foldlen--;
2147                 }
2148             }
2149
2150             /* The current character (and any potential folds) should be added
2151              * to the possible matching characters for this position in this
2152              * branch */
2153             if ( uvc < 256 ) {
2154                 if ( folder ) {
2155                     U8 folded= folder[ (U8) uvc ];
2156                     if ( !trie->charmap[ folded ] ) {
2157                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158                         TRIE_STORE_REVCHAR( folded );
2159                     }
2160                 }
2161                 if ( !trie->charmap[ uvc ] ) {
2162                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163                     TRIE_STORE_REVCHAR( uvc );
2164                 }
2165                 if ( set_bit ) {
2166                     /* store the codepoint in the bitmap, and its folded
2167                      * equivalent. */
2168                     TRIE_BITMAP_SET(trie, uvc);
2169
2170                     /* store the folded codepoint */
2171                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172
2173                     if ( !UTF ) {
2174                         /* store first byte of utf8 representation of
2175                            variant codepoints */
2176                         if (! UVCHR_IS_INVARIANT(uvc)) {
2177                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178                         }
2179                     }
2180                     set_bit = 0; /* We've done our bit :-) */
2181                 }
2182             } else {
2183
2184                 /* XXX We could come up with the list of code points that fold
2185                  * to this using PL_utf8_foldclosures, except not for
2186                  * multi-char folds, as there may be multiple combinations
2187                  * there that could work, which needs to wait until runtime to
2188                  * resolve (The comment about LIGATURE FFI above is such an
2189                  * example) */
2190
2191                 SV** svpp;
2192                 if ( !widecharmap )
2193                     widecharmap = newHV();
2194
2195                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196
2197                 if ( !svpp )
2198                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199
2200                 if ( !SvTRUE( *svpp ) ) {
2201                     sv_setiv( *svpp, ++trie->uniquecharcount );
2202                     TRIE_STORE_REVCHAR(uvc);
2203                 }
2204             }
2205         } /* end loop through characters in this branch of the trie */
2206
2207         /* We take the min and max for this branch and combine to find the min
2208          * and max for all branches processed so far */
2209         if( cur == first ) {
2210             trie->minlen = minchars;
2211             trie->maxlen = maxchars;
2212         } else if (minchars < trie->minlen) {
2213             trie->minlen = minchars;
2214         } else if (maxchars > trie->maxlen) {
2215             trie->maxlen = maxchars;
2216         }
2217     } /* end first pass */
2218     DEBUG_TRIE_COMPILE_r(
2219         PerlIO_printf( Perl_debug_log,
2220                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221                 (int)depth * 2 + 2,"",
2222                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224                 (int)trie->minlen, (int)trie->maxlen )
2225     );
2226
2227     /*
2228         We now know what we are dealing with in terms of unique chars and
2229         string sizes so we can calculate how much memory a naive
2230         representation using a flat table  will take. If it's over a reasonable
2231         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232         conservative but potentially much slower representation using an array
2233         of lists.
2234
2235         At the end we convert both representations into the same compressed
2236         form that will be used in regexec.c for matching with. The latter
2237         is a form that cannot be used to construct with but has memory
2238         properties similar to the list form and access properties similar
2239         to the table form making it both suitable for fast searches and
2240         small enough that it's feasible to store for the duration of a program.
2241
2242         See the comment in the code where the compressed table is produced
2243         inplace from the flat table representation for an explanation of how
2244         the compression works.
2245
2246     */
2247
2248
2249     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250     prev_states[1] = 0;
2251
2252     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253                                                     > SvIV(re_trie_maxbuff) )
2254     {
2255         /*
2256             Second Pass -- Array Of Lists Representation
2257
2258             Each state will be represented by a list of charid:state records
2259             (reg_trie_trans_le) the first such element holds the CUR and LEN
2260             points of the allocated array. (See defines above).
2261
2262             We build the initial structure using the lists, and then convert
2263             it into the compressed table form which allows faster lookups
2264             (but can't be modified once converted).
2265         */
2266
2267         STRLEN transcount = 1;
2268
2269         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270             "%*sCompiling trie using list compiler\n",
2271             (int)depth * 2 + 2, ""));
2272
2273         trie->states = (reg_trie_state *)
2274             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275                                   sizeof(reg_trie_state) );
2276         TRIE_LIST_NEW(1);
2277         next_alloc = 2;
2278
2279         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280
2281             regnode *noper   = NEXTOPER( cur );
2282             U8 *uc           = (U8*)STRING( noper );
2283             const U8 *e      = uc + STR_LEN( noper );
2284             U32 state        = 1;         /* required init */
2285             U16 charid       = 0;         /* sanity init */
2286             U32 wordlen      = 0;         /* required init */
2287
2288             if (OP(noper) == NOTHING) {
2289                 regnode *noper_next= regnext(noper);
2290                 if (noper_next != tail && OP(noper_next) == flags) {
2291                     noper = noper_next;
2292                     uc= (U8*)STRING(noper);
2293                     e= uc + STR_LEN(noper);
2294                 }
2295             }
2296
2297             if (OP(noper) != NOTHING) {
2298                 for ( ; uc < e ; uc += len ) {
2299
2300                     TRIE_READ_CHAR;
2301
2302                     if ( uvc < 256 ) {
2303                         charid = trie->charmap[ uvc ];
2304                     } else {
2305                         SV** const svpp = hv_fetch( widecharmap,
2306                                                     (char*)&uvc,
2307                                                     sizeof( UV ),
2308                                                     0);
2309                         if ( !svpp ) {
2310                             charid = 0;
2311                         } else {
2312                             charid=(U16)SvIV( *svpp );
2313                         }
2314                     }
2315                     /* charid is now 0 if we dont know the char read, or
2316                      * nonzero if we do */
2317                     if ( charid ) {
2318
2319                         U16 check;
2320                         U32 newstate = 0;
2321
2322                         charid--;
2323                         if ( !trie->states[ state ].trans.list ) {
2324                             TRIE_LIST_NEW( state );
2325                         }
2326                         for ( check = 1;
2327                               check <= TRIE_LIST_USED( state );
2328                               check++ )
2329                         {
2330                             if ( TRIE_LIST_ITEM( state, check ).forid
2331                                                                     == charid )
2332                             {
2333                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334                                 break;
2335                             }
2336                         }
2337                         if ( ! newstate ) {
2338                             newstate = next_alloc++;
2339                             prev_states[newstate] = state;
2340                             TRIE_LIST_PUSH( state, charid, newstate );
2341                             transcount++;
2342                         }
2343                         state = newstate;
2344                     } else {
2345                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346                     }
2347                 }
2348             }
2349             TRIE_HANDLE_WORD(state);
2350
2351         } /* end second pass */
2352
2353         /* next alloc is the NEXT state to be allocated */
2354         trie->statecount = next_alloc;
2355         trie->states = (reg_trie_state *)
2356             PerlMemShared_realloc( trie->states,
2357                                    next_alloc
2358                                    * sizeof(reg_trie_state) );
2359
2360         /* and now dump it out before we compress it */
2361         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362                                                          revcharmap, next_alloc,
2363                                                          depth+1)
2364         );
2365
2366         trie->trans = (reg_trie_trans *)
2367             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368         {
2369             U32 state;
2370             U32 tp = 0;
2371             U32 zp = 0;
2372
2373
2374             for( state=1 ; state < next_alloc ; state ++ ) {
2375                 U32 base=0;
2376
2377                 /*
2378                 DEBUG_TRIE_COMPILE_MORE_r(
2379                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380                 );
2381                 */
2382
2383                 if (trie->states[state].trans.list) {
2384                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385                     U16 maxid=minid;
2386                     U16 idx;
2387
2388                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390                         if ( forid < minid ) {
2391                             minid=forid;
2392                         } else if ( forid > maxid ) {
2393                             maxid=forid;
2394                         }
2395                     }
2396                     if ( transcount < tp + maxid - minid + 1) {
2397                         transcount *= 2;
2398                         trie->trans = (reg_trie_trans *)
2399                             PerlMemShared_realloc( trie->trans,
2400                                                      transcount
2401                                                      * sizeof(reg_trie_trans) );
2402                         Zero( trie->trans + (transcount / 2),
2403                               transcount / 2,
2404                               reg_trie_trans );
2405                     }
2406                     base = trie->uniquecharcount + tp - minid;
2407                     if ( maxid == minid ) {
2408                         U32 set = 0;
2409                         for ( ; zp < tp ; zp++ ) {
2410                             if ( ! trie->trans[ zp ].next ) {
2411                                 base = trie->uniquecharcount + zp - minid;
2412                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                                                                    1).newstate;
2414                                 trie->trans[ zp ].check = state;
2415                                 set = 1;
2416                                 break;
2417                             }
2418                         }
2419                         if ( !set ) {
2420                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                                                                    1).newstate;
2422                             trie->trans[ tp ].check = state;
2423                             tp++;
2424                             zp = tp;
2425                         }
2426                     } else {
2427                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428                             const U32 tid = base
2429                                            - trie->uniquecharcount
2430                                            + TRIE_LIST_ITEM( state, idx ).forid;
2431                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                                                                 idx ).newstate;
2433                             trie->trans[ tid ].check = state;
2434                         }
2435                         tp += ( maxid - minid + 1 );
2436                     }
2437                     Safefree(trie->states[ state ].trans.list);
2438                 }
2439                 /*
2440                 DEBUG_TRIE_COMPILE_MORE_r(
2441                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442                 );
2443                 */
2444                 trie->states[ state ].trans.base=base;
2445             }
2446             trie->lasttrans = tp + 1;
2447         }
2448     } else {
2449         /*
2450            Second Pass -- Flat Table Representation.
2451
2452            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453            each.  We know that we will need Charcount+1 trans at most to store
2454            the data (one row per char at worst case) So we preallocate both
2455            structures assuming worst case.
2456
2457            We then construct the trie using only the .next slots of the entry
2458            structs.
2459
2460            We use the .check field of the first entry of the node temporarily
2461            to make compression both faster and easier by keeping track of how
2462            many non zero fields are in the node.
2463
2464            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465            transition.
2466
2467            There are two terms at use here: state as a TRIE_NODEIDX() which is
2468            a number representing the first entry of the node, and state as a
2469            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471            if there are 2 entries per node. eg:
2472
2473              A B       A B
2474           1. 2 4    1. 3 7
2475           2. 0 3    3. 0 5
2476           3. 0 0    5. 0 0
2477           4. 0 0    7. 0 0
2478
2479            The table is internally in the right hand, idx form. However as we
2480            also have to deal with the states array which is indexed by nodenum
2481            we have to use TRIE_NODENUM() to convert.
2482
2483         */
2484         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485             "%*sCompiling trie using table compiler\n",
2486             (int)depth * 2 + 2, ""));
2487
2488         trie->trans = (reg_trie_trans *)
2489             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490                                   * trie->uniquecharcount + 1,
2491                                   sizeof(reg_trie_trans) );
2492         trie->states = (reg_trie_state *)
2493             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494                                   sizeof(reg_trie_state) );
2495         next_alloc = trie->uniquecharcount + 1;
2496
2497
2498         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499
2500             regnode *noper   = NEXTOPER( cur );
2501             const U8 *uc     = (U8*)STRING( noper );
2502             const U8 *e      = uc + STR_LEN( noper );
2503
2504             U32 state        = 1;         /* required init */
2505
2506             U16 charid       = 0;         /* sanity init */
2507             U32 accept_state = 0;         /* sanity init */
2508
2509             U32 wordlen      = 0;         /* required init */
2510
2511             if (OP(noper) == NOTHING) {
2512                 regnode *noper_next= regnext(noper);
2513                 if (noper_next != tail && OP(noper_next) == flags) {
2514                     noper = noper_next;
2515                     uc= (U8*)STRING(noper);
2516                     e= uc + STR_LEN(noper);
2517                 }
2518             }
2519
2520             if ( OP(noper) != NOTHING ) {
2521                 for ( ; uc < e ; uc += len ) {
2522
2523                     TRIE_READ_CHAR;
2524
2525                     if ( uvc < 256 ) {
2526                         charid = trie->charmap[ uvc ];
2527                     } else {
2528                         SV* const * const svpp = hv_fetch( widecharmap,
2529                                                            (char*)&uvc,
2530                                                            sizeof( UV ),
2531                                                            0);
2532                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2533                     }
2534                     if ( charid ) {
2535                         charid--;
2536                         if ( !trie->trans[ state + charid ].next ) {
2537                             trie->trans[ state + charid ].next = next_alloc;
2538                             trie->trans[ state ].check++;
2539                             prev_states[TRIE_NODENUM(next_alloc)]
2540                                     = TRIE_NODENUM(state);
2541                             next_alloc += trie->uniquecharcount;
2542                         }
2543                         state = trie->trans[ state + charid ].next;
2544                     } else {
2545                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546                     }
2547                     /* charid is now 0 if we dont know the char read, or
2548                      * nonzero if we do */
2549                 }
2550             }
2551             accept_state = TRIE_NODENUM( state );
2552             TRIE_HANDLE_WORD(accept_state);
2553
2554         } /* end second pass */
2555
2556         /* and now dump it out before we compress it */
2557         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558                                                           revcharmap,
2559                                                           next_alloc, depth+1));
2560
2561         {
2562         /*
2563            * Inplace compress the table.*
2564
2565            For sparse data sets the table constructed by the trie algorithm will
2566            be mostly 0/FAIL transitions or to put it another way mostly empty.
2567            (Note that leaf nodes will not contain any transitions.)
2568
2569            This algorithm compresses the tables by eliminating most such
2570            transitions, at the cost of a modest bit of extra work during lookup:
2571
2572            - Each states[] entry contains a .base field which indicates the
2573            index in the trans[] array where its transition data is stored.
2574
2575            - If .base is 0 there are no valid transitions from that node.
2576
2577            - If .base is nonzero then charid is added to it to find an entry in
2578            the trans array.
2579
2580            -If trans[states[state].base+charid].check!=state then the
2581            transition is taken to be a 0/Fail transition. Thus if there are fail
2582            transitions at the front of the node then the .base offset will point
2583            somewhere inside the previous nodes data (or maybe even into a node
2584            even earlier), but the .check field determines if the transition is
2585            valid.
2586
2587            XXX - wrong maybe?
2588            The following process inplace converts the table to the compressed
2589            table: We first do not compress the root node 1,and mark all its
2590            .check pointers as 1 and set its .base pointer as 1 as well. This
2591            allows us to do a DFA construction from the compressed table later,
2592            and ensures that any .base pointers we calculate later are greater
2593            than 0.
2594
2595            - We set 'pos' to indicate the first entry of the second node.
2596
2597            - We then iterate over the columns of the node, finding the first and
2598            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599            and set the .check pointers accordingly, and advance pos
2600            appropriately and repeat for the next node. Note that when we copy
2601            the next pointers we have to convert them from the original
2602            NODEIDX form to NODENUM form as the former is not valid post
2603            compression.
2604
2605            - If a node has no transitions used we mark its base as 0 and do not
2606            advance the pos pointer.
2607
2608            - If a node only has one transition we use a second pointer into the
2609            structure to fill in allocated fail transitions from other states.
2610            This pointer is independent of the main pointer and scans forward
2611            looking for null transitions that are allocated to a state. When it
2612            finds one it writes the single transition into the "hole".  If the
2613            pointer doesnt find one the single transition is appended as normal.
2614
2615            - Once compressed we can Renew/realloc the structures to release the
2616            excess space.
2617
2618            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619            specifically Fig 3.47 and the associated pseudocode.
2620
2621            demq
2622         */
2623         const U32 laststate = TRIE_NODENUM( next_alloc );
2624         U32 state, charid;
2625         U32 pos = 0, zp=0;
2626         trie->statecount = laststate;
2627
2628         for ( state = 1 ; state < laststate ; state++ ) {
2629             U8 flag = 0;
2630             const U32 stateidx = TRIE_NODEIDX( state );
2631             const U32 o_used = trie->trans[ stateidx ].check;
2632             U32 used = trie->trans[ stateidx ].check;
2633             trie->trans[ stateidx ].check = 0;
2634
2635             for ( charid = 0;
2636                   used && charid < trie->uniquecharcount;
2637                   charid++ )
2638             {
2639                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640                     if ( trie->trans[ stateidx + charid ].next ) {
2641                         if (o_used == 1) {
2642                             for ( ; zp < pos ; zp++ ) {
2643                                 if ( ! trie->trans[ zp ].next ) {
2644                                     break;
2645                                 }
2646                             }
2647                             trie->states[ state ].trans.base
2648                                                     = zp
2649                                                       + trie->uniquecharcount
2650                                                       - charid ;
2651                             trie->trans[ zp ].next
2652                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                                                              + charid ].next );
2654                             trie->trans[ zp ].check = state;
2655                             if ( ++zp > pos ) pos = zp;
2656                             break;
2657                         }
2658                         used--;
2659                     }
2660                     if ( !flag ) {
2661                         flag = 1;
2662                         trie->states[ state ].trans.base
2663                                        = pos + trie->uniquecharcount - charid ;
2664                     }
2665                     trie->trans[ pos ].next
2666                         = SAFE_TRIE_NODENUM(
2667                                        trie->trans[ stateidx + charid ].next );
2668                     trie->trans[ pos ].check = state;
2669                     pos++;
2670                 }
2671             }
2672         }
2673         trie->lasttrans = pos + 1;
2674         trie->states = (reg_trie_state *)
2675             PerlMemShared_realloc( trie->states, laststate
2676                                    * sizeof(reg_trie_state) );
2677         DEBUG_TRIE_COMPILE_MORE_r(
2678             PerlIO_printf( Perl_debug_log,
2679                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680                 (int)depth * 2 + 2,"",
2681                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682                        + 1 ),
2683                 (IV)next_alloc,
2684                 (IV)pos,
2685                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686             );
2687
2688         } /* end table compress */
2689     }
2690     DEBUG_TRIE_COMPILE_MORE_r(
2691             PerlIO_printf(Perl_debug_log,
2692                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693                 (int)depth * 2 + 2, "",
2694                 (UV)trie->statecount,
2695                 (UV)trie->lasttrans)
2696     );
2697     /* resize the trans array to remove unused space */
2698     trie->trans = (reg_trie_trans *)
2699         PerlMemShared_realloc( trie->trans, trie->lasttrans
2700                                * sizeof(reg_trie_trans) );
2701
2702     {   /* Modify the program and insert the new TRIE node */
2703         U8 nodetype =(U8)(flags & 0xFF);
2704         char *str=NULL;
2705
2706 #ifdef DEBUGGING
2707         regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709
2710         U32 mjd_offset = 0;
2711         U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714         /*
2715            This means we convert either the first branch or the first Exact,
2716            depending on whether the thing following (in 'last') is a branch
2717            or not and whether first is the startbranch (ie is it a sub part of
2718            the alternation or is it the whole thing.)
2719            Assuming it's a sub part we convert the EXACT; otherwise we convert
2720            the whole branch sequence, including the first.
2721          */
2722         /* Find the node we are going to overwrite */
2723         if ( first != startbranch || OP( last ) == BRANCH ) {
2724             /* branch sub-chain */
2725             NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727             DEBUG_r({
2728                 mjd_offset= Node_Offset((convert));
2729                 mjd_nodelen= Node_Length((convert));
2730             });
2731 #endif
2732             /* whole branch chain */
2733         }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735         else {
2736             DEBUG_r({
2737                 const  regnode *nop = NEXTOPER( convert );
2738                 mjd_offset= Node_Offset((nop));
2739                 mjd_nodelen= Node_Length((nop));
2740             });
2741         }
2742         DEBUG_OPTIMISE_r(
2743             PerlIO_printf(Perl_debug_log,
2744                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745                 (int)depth * 2 + 2, "",
2746                 (UV)mjd_offset, (UV)mjd_nodelen)
2747         );
2748 #endif
2749         /* But first we check to see if there is a common prefix we can
2750            split out as an EXACT and put in front of the TRIE node.  */
2751         trie->startstate= 1;
2752         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753             U32 state;
2754             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755                 U32 ofs = 0;
2756                 I32 idx = -1;
2757                 U32 count = 0;
2758                 const U32 base = trie->states[ state ].trans.base;
2759
2760                 if ( trie->states[state].wordnum )
2761                         count = 1;
2762
2763                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2765                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767                     {
2768                         if ( ++count > 1 ) {
2769                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2770                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771                             if ( state == 1 ) break;
2772                             if ( count == 2 ) {
2773                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774                                 DEBUG_OPTIMISE_r(
2775                                     PerlIO_printf(Perl_debug_log,
2776                                         "%*sNew Start State=%"UVuf" Class: [",
2777                                         (int)depth * 2 + 2, "",
2778                                         (UV)state));
2779                                 if (idx >= 0) {
2780                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782
2783                                     TRIE_BITMAP_SET(trie,*ch);
2784                                     if ( folder )
2785                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786                                     DEBUG_OPTIMISE_r(
2787                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788                                     );
2789                                 }
2790                             }
2791                             TRIE_BITMAP_SET(trie,*ch);
2792                             if ( folder )
2793                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795                         }
2796                         idx = ofs;
2797                     }
2798                 }
2799                 if ( count == 1 ) {
2800                     SV **tmp = av_fetch( revcharmap, idx, 0);
2801                     STRLEN len;
2802                     char *ch = SvPV( *tmp, len );
2803                     DEBUG_OPTIMISE_r({
2804                         SV *sv=sv_newmortal();
2805                         PerlIO_printf( Perl_debug_log,
2806                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807                             (int)depth * 2 + 2, "",
2808                             (UV)state, (UV)idx,
2809                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810                                 PL_colors[0], PL_colors[1],
2811                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812                                 PERL_PV_ESCAPE_FIRSTCHAR
2813                             )
2814                         );
2815                     });
2816                     if ( state==1 ) {
2817                         OP( convert ) = nodetype;
2818                         str=STRING(convert);
2819                         STR_LEN(convert)=0;
2820                     }
2821                     STR_LEN(convert) += len;
2822                     while (len--)
2823                         *str++ = *ch++;
2824                 } else {
2825 #ifdef DEBUGGING
2826                     if (state>1)
2827                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829                     break;
2830                 }
2831             }
2832             trie->prefixlen = (state-1);
2833             if (str) {
2834                 regnode *n = convert+NODE_SZ_STR(convert);
2835                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836                 trie->startstate = state;
2837                 trie->minlen -= (state - 1);
2838                 trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840                /* At least the UNICOS C compiler choked on this
2841                 * being argument to DEBUG_r(), so let's just have
2842                 * it right here. */
2843                if (
2844 #ifdef PERL_EXT_RE_BUILD
2845                    1
2846 #else
2847                    DEBUG_r_TEST
2848 #endif
2849                    ) {
2850                    regnode *fix = convert;
2851                    U32 word = trie->wordcount;
2852                    mjd_nodelen++;
2853                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854                    while( ++fix < n ) {
2855                        Set_Node_Offset_Length(fix, 0, 0);
2856                    }
2857                    while (word--) {
2858                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2859                        if (tmp) {
2860                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862                            else
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864                        }
2865                    }
2866                }
2867 #endif
2868                 if (trie->maxlen) {
2869                     convert = n;
2870                 } else {
2871                     NEXT_OFF(convert) = (U16)(tail - convert);
2872                     DEBUG_r(optimize= n);
2873                 }
2874             }
2875         }
2876         if (!jumper)
2877             jumper = last;
2878         if ( trie->maxlen ) {
2879             NEXT_OFF( convert ) = (U16)(tail - convert);
2880             ARG_SET( convert, data_slot );
2881             /* Store the offset to the first unabsorbed branch in
2882                jump[0], which is otherwise unused by the jump logic.
2883                We use this when dumping a trie and during optimisation. */
2884             if (trie->jump)
2885                 trie->jump[0] = (U16)(nextbranch - convert);
2886
2887             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888              *   and there is a bitmap
2889              *   and the first "jump target" node we found leaves enough room
2890              * then convert the TRIE node into a TRIEC node, with the bitmap
2891              * embedded inline in the opcode - this is hypothetically faster.
2892              */
2893             if ( !trie->states[trie->startstate].wordnum
2894                  && trie->bitmap
2895                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896             {
2897                 OP( convert ) = TRIEC;
2898                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899                 PerlMemShared_free(trie->bitmap);
2900                 trie->bitmap= NULL;
2901             } else
2902                 OP( convert ) = TRIE;
2903
2904             /* store the type in the flags */
2905             convert->flags = nodetype;
2906             DEBUG_r({
2907             optimize = convert
2908                       + NODE_STEP_REGNODE
2909                       + regarglen[ OP( convert ) ];
2910             });
2911             /* XXX We really should free up the resource in trie now,
2912                    as we won't use them - (which resources?) dmq */
2913         }
2914         /* needed for dumping*/
2915         DEBUG_r(if (optimize) {
2916             regnode *opt = convert;
2917
2918             while ( ++opt < optimize) {
2919                 Set_Node_Offset_Length(opt,0,0);
2920             }
2921             /*
2922                 Try to clean up some of the debris left after the
2923                 optimisation.
2924              */
2925             while( optimize < jumper ) {
2926                 mjd_nodelen += Node_Length((optimize));
2927                 OP( optimize ) = OPTIMIZED;
2928                 Set_Node_Offset_Length(optimize,0,0);
2929                 optimize++;
2930             }
2931             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932         });
2933     } /* end node insert */
2934
2935     /*  Finish populating the prev field of the wordinfo array.  Walk back
2936      *  from each accept state until we find another accept state, and if
2937      *  so, point the first word's .prev field at the second word. If the
2938      *  second already has a .prev field set, stop now. This will be the
2939      *  case either if we've already processed that word's accept state,
2940      *  or that state had multiple words, and the overspill words were
2941      *  already linked up earlier.
2942      */
2943     {
2944         U16 word;
2945         U32 state;
2946         U16 prev;
2947
2948         for (word=1; word <= trie->wordcount; word++) {
2949             prev = 0;
2950             if (trie->wordinfo[word].prev)
2951                 continue;
2952             state = trie->wordinfo[word].accept;
2953             while (state) {
2954                 state = prev_states[state];
2955                 if (!state)
2956                     break;
2957                 prev = trie->states[state].wordnum;
2958                 if (prev)
2959                     break;
2960             }
2961             trie->wordinfo[word].prev = prev;
2962         }
2963         Safefree(prev_states);
2964     }
2965
2966
2967     /* and now dump out the compressed format */
2968     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2969
2970     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2971 #ifdef DEBUGGING
2972     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2973     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2974 #else
2975     SvREFCNT_dec_NN(revcharmap);
2976 #endif
2977     return trie->jump
2978            ? MADE_JUMP_TRIE
2979            : trie->startstate>1
2980              ? MADE_EXACT_TRIE
2981              : MADE_TRIE;
2982 }
2983
2984 STATIC void
2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2986 {
2987 /* The Trie is constructed and compressed now so we can build a fail array if
2988  * it's needed
2989
2990    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2991    3.32 in the
2992    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2993    Ullman 1985/88
2994    ISBN 0-201-10088-6
2995
2996    We find the fail state for each state in the trie, this state is the longest
2997    proper suffix of the current state's 'word' that is also a proper prefix of
2998    another word in our trie. State 1 represents the word '' and is thus the
2999    default fail state. This allows the DFA not to have to restart after its
3000    tried and failed a word at a given point, it simply continues as though it
3001    had been matching the other word in the first place.
3002    Consider
3003       'abcdgu'=~/abcdefg|cdgu/
3004    When we get to 'd' we are still matching the first word, we would encounter
3005    'g' which would fail, which would bring us to the state representing 'd' in
3006    the second word where we would try 'g' and succeed, proceeding to match
3007    'cdgu'.
3008  */
3009  /* add a fail transition */
3010     const U32 trie_offset = ARG(source);
3011     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3012     U32 *q;
3013     const U32 ucharcount = trie->uniquecharcount;
3014     const U32 numstates = trie->statecount;
3015     const U32 ubound = trie->lasttrans + ucharcount;
3016     U32 q_read = 0;
3017     U32 q_write = 0;
3018     U32 charid;
3019     U32 base = trie->states[ 1 ].trans.base;
3020     U32 *fail;
3021     reg_ac_data *aho;
3022     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3023     GET_RE_DEBUG_FLAGS_DECL;
3024
3025     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3026 #ifndef DEBUGGING
3027     PERL_UNUSED_ARG(depth);
3028 #endif
3029
3030
3031     ARG_SET( stclass, data_slot );
3032     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3033     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3034     aho->trie=trie_offset;
3035     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3036     Copy( trie->states, aho->states, numstates, reg_trie_state );
3037     Newxz( q, numstates, U32);
3038     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3039     aho->refcount = 1;
3040     fail = aho->fail;
3041     /* initialize fail[0..1] to be 1 so that we always have
3042        a valid final fail state */
3043     fail[ 0 ] = fail[ 1 ] = 1;
3044
3045     for ( charid = 0; charid < ucharcount ; charid++ ) {
3046         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3047         if ( newstate ) {
3048             q[ q_write ] = newstate;
3049             /* set to point at the root */
3050             fail[ q[ q_write++ ] ]=1;
3051         }
3052     }
3053     while ( q_read < q_write) {
3054         const U32 cur = q[ q_read++ % numstates ];
3055         base = trie->states[ cur ].trans.base;
3056
3057         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3058             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3059             if (ch_state) {
3060                 U32 fail_state = cur;
3061                 U32 fail_base;
3062                 do {
3063                     fail_state = fail[ fail_state ];
3064                     fail_base = aho->states[ fail_state ].trans.base;
3065                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3066
3067                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3068                 fail[ ch_state ] = fail_state;
3069                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3070                 {
3071                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3072                 }
3073                 q[ q_write++ % numstates] = ch_state;
3074             }
3075         }
3076     }
3077     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3078        when we fail in state 1, this allows us to use the
3079        charclass scan to find a valid start char. This is based on the principle
3080        that theres a good chance the string being searched contains lots of stuff
3081        that cant be a start char.
3082      */
3083     fail[ 0 ] = fail[ 1 ] = 0;
3084     DEBUG_TRIE_COMPILE_r({
3085         PerlIO_printf(Perl_debug_log,
3086                       "%*sStclass Failtable (%"UVuf" states): 0",
3087                       (int)(depth * 2), "", (UV)numstates
3088         );
3089         for( q_read=1; q_read<numstates; q_read++ ) {
3090             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3091         }
3092         PerlIO_printf(Perl_debug_log, "\n");
3093     });
3094     Safefree(q);
3095     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3096 }
3097
3098
/* DEBUG_PEEP(str, scan, depth): optimiser debugging aid.
 *
 * When 'scan' is non-NULL this prints one line to Perl_debug_log (under
 * DEBUG_OPTIMISE_r): the tag 'str', the node number of 'scan', the
 * regprop() description of 'scan', and the node number of regnext(scan)
 * (0 if there is no next node).  The line is indented by two columns per
 * level of 'depth'.
 *
 * 'str' is concatenated with adjacent string literals, so it must itself be
 * a string literal.  'scan' is evaluated more than once.  'depth' is
 * parenthesised so that an expression argument (e.g. depth+1) is cast and
 * doubled as a whole rather than being split by operator precedence.
 * The expansion already ends in ';'. */
#define DEBUG_PEEP(str,scan,depth) \
    DEBUG_OPTIMISE_r({if (scan){ \
       SV * const mysv=sv_newmortal(); \
       regnode *Next = regnext(scan); \
       regprop(RExC_rx, mysv, scan, NULL); \
       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
       Next ? (REG_NODE_NUM(Next)) : 0 ); \
   }});
3108
3109
3110 /* The below joins as many adjacent EXACTish nodes as possible into a single
3111  * one.  The regop may be changed if the node(s) contain certain sequences that
3112  * require special handling.  The joining is only done if:
3113  * 1) there is room in the current conglomerated node to entirely contain the
3114  *    next one.
3115  * 2) they are the exact same node type
3116  *
3117  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3118  * these get optimized out
3119  *
3120  * If a node is to match under /i (folded), the number of characters it matches
3121  * can be different than its character length if it contains a multi-character
3122  * fold.  *min_subtract is set to the total delta number of characters of the
3123  * input nodes.
3124  *
3125  * And *unfolded_multi_char is set to indicate whether or not the node contains
3126  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3127  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128  * SMALL LETTER SHARP S, as only if the target string being matched against
3129  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3131  * whose components are all above the Latin1 range are not run-time locale
3132  * dependent, and have already been folded by the time this function is
3133  * called.)
3134  *
3135  * This is as good a place as any to discuss the design of handling these
3136  * multi-character fold sequences.  It's been wrong in Perl for a very long
3137  * time.  There are three code points in Unicode whose multi-character folds
3138  * were long ago discovered to mess things up.  The previous designs for
3139  * dealing with these involved assigning a special node for them.  This
3140  * approach doesn't always work, as evidenced by this example:
3141  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3142  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3143  * would match just the \xDF, it won't be able to handle the case where a
3144  * successful match would have to cross the node's boundary.  The new approach
3145  * that hopefully generally solves the problem generates an EXACTFU_SS node
3146  * that is "sss" in this case.
3147  *
3148  * It turns out that there are problems with all multi-character folds, and not
3149  * just these three.  Now the code is general, for all such cases.  The
3150  * approach taken is:
3151  * 1)   This routine examines each EXACTFish node that could contain multi-
3152  *      character folded sequences.  Since a single character can fold into
3153  *      such a sequence, the minimum match length for this node is less than
3154  *      the number of characters in the node.  This routine returns in
3155  *      *min_subtract how many characters to subtract from the actual
3156  *      length of the string to get a real minimum match length; it is 0 if
3157  *      there are no multi-char folds.  This delta is used by the caller to
3158  *      adjust the min length of the match, and the delta between min and max,
3159  *      so that the optimizer doesn't reject these possibilities based on size
3160  *      constraints.
3161  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3162  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3163  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3164  *      there is a possible fold length change.  That means that a regular
3165  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166  *      with length changes, and so can be processed faster.  regexec.c takes
3167  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3168  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169  *      known until runtime).  This saves effort in regex matching.  However,
3170  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3171  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3173  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3174  *      possibilities for the non-UTF8 patterns are quite simple, except for
3175  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3176  *      members of a fold-pair, and arrays are set up for all of them so that
3177  *      the other member of the pair can be found quickly.  Code elsewhere in
3178  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3180  *      described in the next item.
3181  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3182  *      validity of the fold won't be known until runtime, and so must remain
3183  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3184  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3185  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3186  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187  *      The reason this is a problem is that the optimizer part of regexec.c
3188  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189  *      that a character in the pattern corresponds to at most a single
3190  *      character in the target string.  (And I do mean character, and not byte
3191  *      here, unlike other parts of the documentation that have never been
3192  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3193  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3195  *      nodes, violate the assumption, and they are the only instances where it
3196  *      is violated.  I'm reluctant to try to change the assumption, as the
3197  *      code involved is impenetrable to me (khw), so instead the code here
3198  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3199  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200  *      boolean indicating whether or not the node contains such a fold.  When
3201  *      it is true, the caller sets a flag that later causes the optimizer in
3202  *      this file to not set values for the floating and fixed string lengths,
3203  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3204  *      assumption.  Thus, there is no optimization based on string lengths for
3205  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3207  *      assumption is wrong only in these cases is that all other non-UTF-8
3208  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3210  *      EXACTF nodes because we don't know at compile time if it actually
3211  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3212  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3213  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3214  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the
3215  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216  *      string would require the pattern to be forced into UTF-8, the overhead
3217  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3218  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219  *      locale.))
3220  *
3221  *      Similarly, the code that generates tries doesn't currently handle
3222  *      not-already-folded multi-char folds, and it looks like a pain to change
3223  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3224  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3225  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3226  *      using /iaa matching will be doing so almost entirely with ASCII
3227  *      strings, so this should rarely be encountered in practice */
3228
/* JOIN_EXACT(scan, min_subtract, unfolded_multi_char, flags): if 'scan' is a
 * node of kind EXACT, call join_exact() on it, passing the arguments through
 * together with a NULL 'val' and depth+1; otherwise do nothing.
 *
 * The macro expects 'pRExC_state' and 'depth' to be in scope where it is
 * used, and must be followed by ';'.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement: with a bare 'if', an 'else' following the macro at the call
 * site would silently attach to the macro's own 'if'.
 * NOTE(review): if the project has its own statement-wrapper macros for
 * this purpose, they could be substituted here -- not visible from this
 * part of the file. */
#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
    do { \
        if (PL_regkind[OP(scan)] == EXACT) \
            join_exact(pRExC_state,(scan),(min_subtract),(unfolded_multi_char), (flags),NULL,depth+1); \
    } while (0)
3232
3233 STATIC U32
3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3235                    UV *min_subtract, bool *unfolded_multi_char,
3236                    U32 flags,regnode *val, U32 depth)
3237 {
3238     /* Merge several consecutive EXACTish nodes into one. */
3239     regnode *n = regnext(scan);
3240     U32 stringok = 1;
3241     regnode *next = scan + NODE_SZ_STR(scan);
3242     U32 merged = 0;
3243     U32 stopnow = 0;
3244 #ifdef DEBUGGING
3245     regnode *stop = scan;
3246     GET_RE_DEBUG_FLAGS_DECL;
3247 #else
3248     PERL_UNUSED_ARG(depth);
3249 #endif
3250
3251     PERL_ARGS_ASSERT_JOIN_EXACT;
3252 #ifndef EXPERIMENTAL_INPLACESCAN
3253     PERL_UNUSED_ARG(flags);
3254     PERL_UNUSED_ARG(val);
3255 #endif
3256     DEBUG_PEEP("join",scan,depth);
3257
3258     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3259      * EXACT ones that are mergeable to the current one. */
3260     while (n
3261            && (PL_regkind[OP(n)] == NOTHING
3262                || (stringok && OP(n) == OP(scan)))
3263            && NEXT_OFF(n)
3264            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265     {
3266
3267         if (OP(n) == TAIL || n > next)
3268             stringok = 0;
3269         if (PL_regkind[OP(n)] == NOTHING) {
3270             DEBUG_PEEP("skip:",n,depth);
3271             NEXT_OFF(scan) += NEXT_OFF(n);
3272             next = n + NODE_STEP_REGNODE;
3273 #ifdef DEBUGGING
3274             if (stringok)
3275                 stop = n;
3276 #endif
3277             n = regnext(n);
3278         }
3279         else if (stringok) {
3280             const unsigned int oldl = STR_LEN(scan);
3281             regnode * const nnext = regnext(n);
3282
3283             /* XXX I (khw) kind of doubt that this works on platforms (should
3284              * Perl ever run on one) where U8_MAX is above 255 because of lots
3285              * of other assumptions */
3286             /* Don't join if the sum can't fit into a single node */
3287             if (oldl + STR_LEN(n) > U8_MAX)
3288                 break;
3289
3290             DEBUG_PEEP("merg",n,depth);
3291             merged++;
3292
3293             NEXT_OFF(scan) += NEXT_OFF(n);
3294             STR_LEN(scan) += STR_LEN(n);
3295             next = n + NODE_SZ_STR(n);
3296             /* Now we can overwrite *n : */
3297             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298 #ifdef DEBUGGING
3299             stop = next - 1;
3300 #endif
3301             n = nnext;
3302             if (stopnow) break;
3303         }
3304
3305 #ifdef EXPERIMENTAL_INPLACESCAN
3306         if (flags && !NEXT_OFF(n)) {
3307             DEBUG_PEEP("atch", val, depth);
3308             if (reg_off_by_arg[OP(n)]) {
3309                 ARG_SET(n, val - n);
3310             }
3311             else {
3312                 NEXT_OFF(n) = val - n;
3313             }
3314             stopnow = 1;
3315         }
3316 #endif
3317     }
3318
3319     *min_subtract = 0;
3320     *unfolded_multi_char = FALSE;
3321
3322     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3323      * can now analyze for sequences of problematic code points.  (Prior to
3324      * this final joining, sequences could have been split over boundaries, and
3325      * hence missed).  The sequences only happen in folding, hence for any
3326      * non-EXACT EXACTish node */
3327     if (OP(scan) != EXACT) {
3328         U8* s0 = (U8*) STRING(scan);
3329         U8* s = s0;
3330         U8* s_end = s0 + STR_LEN(scan);
3331
3332         int total_count_delta = 0;  /* Total delta number of characters that
3333                                        multi-char folds expand to */
3334
3335         /* One pass is made over the node's string looking for all the
3336          * possibilities.  To avoid some tests in the loop, there are two main
3337          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338          * non-UTF-8 */
3339         if (UTF) {
3340             U8* folded = NULL;
3341
3342             if (OP(scan) == EXACTFL) {
3343                 U8 *d;
3344
3345                 /* An EXACTFL node would already have been changed to another
3346                  * node type unless there is at least one character in it that
3347                  * is problematic; likely a character whose fold definition
3348                  * won't be known until runtime, and so has yet to be folded.
3349                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3350                  * to handle the UTF-8 case, we need to create a temporary
3351                  * folded copy using UTF-8 locale rules in order to analyze it.
3352                  * This is because our macros that look to see if a sequence is
3353                  * a multi-char fold assume everything is folded (otherwise the
3354                  * tests in those macros would be too complicated and slow).
3355                  * Note that here, the non-problematic folds will have already
3356                  * been done, so we can just copy such characters.  We actually
3357                  * don't completely fold the EXACTFL string.  We skip the
3358                  * unfolded multi-char folds, as that would just create work
3359                  * below to figure out the size they already are */
3360
3361                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362                 d = folded;
3363                 while (s < s_end) {
3364                     STRLEN s_len = UTF8SKIP(s);
3365                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366                         Copy(s, d, s_len, U8);
3367                         d += s_len;
3368                     }
3369                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3370                         *unfolded_multi_char = TRUE;
3371                         Copy(s, d, s_len, U8);
3372                         d += s_len;
3373                     }
3374                     else if (isASCII(*s)) {
3375                         *(d++) = toFOLD(*s);
3376                     }
3377                     else {
3378                         STRLEN len;
3379                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380                         d += len;
3381                     }
3382                     s += s_len;
3383                 }
3384
3385                 /* Point the remainder of the routine to look at our temporary
3386                  * folded copy */
3387                 s = folded;
3388                 s_end = d;
3389             } /* End of creating folded copy of EXACTFL string */
3390
3391             /* Examine the string for a multi-character fold sequence.  UTF-8
3392              * patterns have all characters pre-folded by the time this code is
3393              * executed */
3394             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395                                      length sequence we are looking for is 2 */
3396             {
3397                 int count = 0;  /* How many characters in a multi-char fold */
3398                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3399                 if (! len) {    /* Not a multi-char fold: get next char */
3400                     s += UTF8SKIP(s);
3401                     continue;
3402                 }
3403
3404                 /* Nodes with 'ss' require special handling, except for
3405                  * EXACTFA-ish for which there is no multi-char fold to this */
3406                 if (len == 2 && *s == 's' && *(s+1) == 's'
3407                     && OP(scan) != EXACTFA
3408                     && OP(scan) != EXACTFA_NO_TRIE)
3409                 {
3410                     count = 2;
3411                     if (OP(scan) != EXACTFL) {
3412                         OP(scan) = EXACTFU_SS;
3413                     }
3414                     s += 2;
3415                 }
3416                 else { /* Here is a generic multi-char fold. */
3417                     U8* multi_end  = s + len;
3418
3419                     /* Count how many characters in it.  In the case of /aa, no
3420                      * folds which contain ASCII code points are allowed, so
3421                      * check for those, and skip if found. */
3422                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3423                         count = utf8_length(s, multi_end);
3424                         s = multi_end;
3425                     }
3426                     else {
3427                         while (s < multi_end) {
3428                             if (isASCII(*s)) {
3429                                 s++;
3430                                 goto next_iteration;
3431                             }
3432                             else {
3433                                 s += UTF8SKIP(s);
3434                             }
3435                             count++;
3436                         }
3437                     }
3438                 }
3439
3440                 /* The delta is how long the sequence is minus 1 (1 is how long
3441                  * the character that folds to the sequence is) */
3442                 total_count_delta += count - 1;
3443               next_iteration: ;
3444             }
3445
3446             /* We created a temporary folded copy of the string in EXACTFL
3447              * nodes.  Therefore we need to be sure it doesn't go below zero,
3448              * as the real string could be shorter */
3449             if (OP(scan) == EXACTFL) {
3450                 int total_chars = utf8_length((U8*) STRING(scan),
3451                                            (U8*) STRING(scan) + STR_LEN(scan));
3452                 if (total_count_delta > total_chars) {
3453                     total_count_delta = total_chars;
3454                 }
3455             }
3456
3457             *min_subtract += total_count_delta;
3458             Safefree(folded);
3459         }
3460         else if (OP(scan) == EXACTFA) {
3461
3462             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3463              * fold to the ASCII range (and there are no existing ones in the
3464              * upper latin1 range).  But, as outlined in the comments preceding
3465              * this function, we need to flag any occurrences of the sharp s.
3466              * This character forbids trie formation (because of added
3467              * complexity) */
3468             while (s < s_end) {
3469                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3470                     OP(scan) = EXACTFA_NO_TRIE;
3471                     *unfolded_multi_char = TRUE;
3472                     break;
3473                 }
3474                 s++;
3475                 continue;
3476             }
3477         }
3478         else {
3479
3480             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3481              * folds that are all Latin1.  As explained in the comments
3482              * preceding this function, we look also for the sharp s in EXACTF
3483              * and EXACTFL nodes; it can be in the final position.  Otherwise
3484              * we can stop looking 1 byte earlier because we have to find at
3485              * least two characters for a multi-fold */
3486             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3487                               ? s_end
3488                               : s_end -1;
3489
3490             while (s < upper) {
3491                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3492                 if (! len) {    /* Not a multi-char fold. */
3493                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3494                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3495                     {
3496                         *unfolded_multi_char = TRUE;
3497                     }
3498                     s++;
3499                     continue;
3500                 }
3501
3502                 if (len == 2
3503                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3504                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3505                 {
3506
3507                     /* EXACTF nodes need to know that the minimum length
3508                      * changed so that a sharp s in the string can match this
3509                      * ss in the pattern, but they remain EXACTF nodes, as they
3510                      * won't match this unless the target string is UTF-8,
3511                      * which we don't know until runtime.  EXACTFL nodes can't
3512                      * transform into EXACTFU nodes */
3513                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3514                         OP(scan) = EXACTFU_SS;
3515                     }
3516                 }
3517
3518                 *min_subtract += len - 1;
3519                 s += len;
3520             }
3521         }
3522     }
3523
3524 #ifdef DEBUGGING
3525     /* Allow dumping but overwriting the collection of skipped
3526      * ops and/or strings with fake optimized ops */
3527     n = scan + NODE_SZ_STR(scan);
3528     while (n <= stop) {
3529         OP(n) = OPTIMIZED;
3530         FLAGS(n) = 0;
3531         NEXT_OFF(n) = 0;
3532         n++;
3533     }
3534 #endif
3535     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3536     return stopnow;
3537 }
3538
3539 /* REx optimizer.  Converts nodes into quicker variants "in place".
3540    Finds fixed substrings.  */
3541
3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3543    to the position after last scanned or to NULL. */
3544
3545 #define INIT_AND_WITHP /* statement sequence; the use site supplies the final ';' and an 'and_withp' variable */ \
3546     assert(!and_withp); /* and_withp is expected to be unset (NULL) on entry */ \
3547     Newx(and_withp,1, regnode_ssc); /* allocate room for exactly one regnode_ssc */ \
3548     SAVEFREEPV(and_withp) /* NOTE(review): presumably schedules the buffer to be freed automatically -- confirm */
3549
3550 /* A scan_frame is one link in a chain of data about sub patterns that need
3551    to be handled separately/specially in study_chunk.  The chain exists so
3552    that recursion can be simulated without losing state.  */
3553 struct scan_frame;
3554 typedef struct scan_frame {
3555     regnode *last;  /* last node to process in this frame */
3556     regnode *next;  /* next node to process when last is reached */
3557     struct scan_frame *prev; /* previous frame in the chain */
3558     U32 prev_recursed_depth; /* NOTE(review): presumably the recursed_depth to go back to when this frame ends -- confirm */
3559     I32 stop; /* which stopparen to use */
3560 } scan_frame;
3561
3562
3563 STATIC SSize_t
3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3565                         SSize_t *minlenp, SSize_t *deltap,
3566                         regnode *last,
3567                         scan_data_t *data,
3568                         I32 stopparen,
3569                         U32 recursed_depth,
3570                         regnode_ssc *and_withp,
3571                         U32 flags, U32 depth)
3572                         /* scanp: Start here (read-write). */
3573                         /* deltap: Write maxlen-minlen here. */
3574                         /* last: Stop before this one. */
3575                         /* data: string data about the pattern */
3576                         /* stopparen: treat close N as END */
3577                         /* recursed: which subroutines have we recursed into */
3578                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3579 {
3580     dVAR;
3581     /* There must be at least this number of characters to match */
3582     SSize_t min = 0;
3583     I32 pars = 0, code;
3584     regnode *scan = *scanp, *next;
3585     SSize_t delta = 0;
3586     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3587     int is_inf_internal = 0;            /* The studied chunk is infinite */
3588     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3589     scan_data_t data_fake;
3590     SV *re_trie_maxbuff = NULL;
3591     regnode *first_non_open = scan;
3592     SSize_t stopmin = SSize_t_MAX;
3593     scan_frame *frame = NULL;
3594     GET_RE_DEBUG_FLAGS_DECL;
3595
3596     PERL_ARGS_ASSERT_STUDY_CHUNK;
3597
3598 #ifdef DEBUGGING
3599     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3600 #endif
3601     if ( depth == 0 ) {
3602         while (first_non_open && OP(first_non_open) == OPEN)
3603             first_non_open=regnext(first_non_open);
3604     }
3605
3606
3607   fake_study_recurse:
3608     while ( scan && OP(scan) != END && scan < last ){
3609         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3610                                    node length to get a real minimum (because
3611                                    the folded version may be shorter) */
3612         bool unfolded_multi_char = FALSE;
3613         /* Peephole optimizer: */
3614         DEBUG_OPTIMISE_MORE_r(
3615         {
3616             PerlIO_printf(Perl_debug_log,
3617                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3618                 ((int) depth*2), "", (long)stopparen,
3619                 (unsigned long)depth, (unsigned long)recursed_depth);
3620             if (recursed_depth) {
3621                 U32 i;
3622                 U32 j;
3623                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3624                     PerlIO_printf(Perl_debug_log,"[");
3625                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3626                         PerlIO_printf(Perl_debug_log,"%d",
3627                             PAREN_TEST(RExC_study_chunk_recursed +
3628                                        (j * RExC_study_chunk_recursed_bytes), i)
3629                             ? 1 : 0
3630                         );
3631                     PerlIO_printf(Perl_debug_log,"]");
3632                 }
3633             }
3634             PerlIO_printf(Perl_debug_log,"\n");
3635         }
3636         );
3637         DEBUG_STUDYDATA("Peep:", data, depth);
3638         DEBUG_PEEP("Peep", scan, depth);
3639
3640
3641         /* The reason we do this here is that we need to deal with things like
3642          * /(?:f)(?:o)(?:o)/ which can't be dealt with by the normal EXACT parsing
3643          * code, as each (?:..) is handled by a different invocation of reg() -- Yves
3644          */
3645         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3646
3647         /* Follow the next-chain of the current node and optimize
3648            away all the NOTHINGs from it.  */
3649         if (OP(scan) != CURLYX) {
3650             const int max = (reg_off_by_arg[OP(scan)]
3651                        ? I32_MAX
3652                        /* I32 may be smaller than U16 on CRAYs! */
3653                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3654             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3655             int noff;
3656             regnode *n = scan;
3657
3658             /* Skip NOTHING and LONGJMP. */
3659             while ((n = regnext(n))
3660                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3661                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3662                    && off + noff < max)
3663                 off += noff;
3664             if (reg_off_by_arg[OP(scan)])
3665                 ARG(scan) = off;
3666             else
3667                 NEXT_OFF(scan) = off;
3668         }
3669
3670
3671
3672         /* The principal pseudo-switch.  Cannot be a switch, since we
3673            look into several different things.  */
3674         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3675                    || OP(scan) == IFTHEN) {
3676             next = regnext(scan);
3677             code = OP(scan);
3678             /* demq: the op(next)==code check is to see if we have
3679              * "branch-branch" AFAICT */
3680
3681             if (OP(next) == code || code == IFTHEN) {
3682                 /* NOTE - There is similar code to this block below for
3683                  * handling TRIE nodes on a re-study.  If you change stuff here
3684                  * check there too. */
3685                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3686                 regnode_ssc accum;
3687                 regnode * const startbranch=scan;
3688
3689                 if (flags & SCF_DO_SUBSTR) {
3690                     /* Cannot merge strings after this. */
3691                     scan_commit(pRExC_state, data, minlenp, is_inf);
3692                 }
3693
3694                 if (flags & SCF_DO_STCLASS)
3695                     ssc_init_zero(pRExC_state, &accum);
3696
3697                 while (OP(scan) == code) {
3698                     SSize_t deltanext, minnext, fake;
3699                     I32 f = 0;
3700                     regnode_ssc this_class;
3701
3702                     num++;
3703                     data_fake.flags = 0;
3704                     if (data) {
3705                         data_fake.whilem_c = data->whilem_c;
3706                         data_fake.last_closep = data->last_closep;
3707                     }
3708                     else
3709                         data_fake.last_closep = &fake;
3710
3711                     data_fake.pos_delta = delta;
3712                     next = regnext(scan);
3713                     scan = NEXTOPER(scan);
3714                     if (code != BRANCH)
3715                         scan = NEXTOPER(scan);
3716                     if (flags & SCF_DO_STCLASS) {
3717                         ssc_init(pRExC_state, &this_class);
3718                         data_fake.start_class = &this_class;
3719                         f = SCF_DO_STCLASS_AND;
3720                     }
3721                     if (flags & SCF_WHILEM_VISITED_POS)
3722                         f |= SCF_WHILEM_VISITED_POS;
3723
3724                     /* we suppose the run is continuous, last=next...*/
3725                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3726                                       &deltanext, next, &data_fake, stopparen,
3727                                       recursed_depth, NULL, f,depth+1);
3728                     if (min1 > minnext)
3729                         min1 = minnext;
3730                     if (deltanext == SSize_t_MAX) {
3731                         is_inf = is_inf_internal = 1;
3732                         max1 = SSize_t_MAX;
3733                     } else if (max1 < minnext + deltanext)
3734                         max1 = minnext + deltanext;
3735                     scan = next;
3736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737                         pars++;
3738                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3739                         if ( stopmin > minnext)
3740                             stopmin = min + min1;
3741                         flags &= ~SCF_DO_SUBSTR;
3742                         if (data)
3743                             data->flags |= SCF_SEEN_ACCEPT;
3744                     }
3745                     if (data) {
3746                         if (data_fake.flags & SF_HAS_EVAL)
3747                             data->flags |= SF_HAS_EVAL;
3748                         data->whilem_c = data_fake.whilem_c;
3749                     }
3750                     if (flags & SCF_DO_STCLASS)
3751                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3752                 }
3753                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3754                     min1 = 0;
3755                 if (flags & SCF_DO_SUBSTR) {
3756                     data->pos_min += min1;
3757                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3758                         data->pos_delta = SSize_t_MAX;
3759                     else
3760                         data->pos_delta += max1 - min1;
3761                     if (max1 != min1 || is_inf)
3762                         data->longest = &(data->longest_float);
3763                 }
3764                 min += min1;
3765                 if (delta == SSize_t_MAX
3766                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3767                     delta = SSize_t_MAX;
3768                 else
3769                     delta += max1 - min1;
3770                 if (flags & SCF_DO_STCLASS_OR) {
3771                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3772                     if (min1) {
3773                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3774                         flags &= ~SCF_DO_STCLASS;
3775                     }
3776                 }
3777                 else if (flags & SCF_DO_STCLASS_AND) {
3778                     if (min1) {
3779                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3780                         flags &= ~SCF_DO_STCLASS;
3781                     }
3782                     else {
3783                         /* Switch to OR mode: cache the old value of
3784                          * data->start_class */
3785                         INIT_AND_WITHP;
3786                         StructCopy(data->start_class, and_withp, regnode_ssc);
3787                         flags &= ~SCF_DO_STCLASS_AND;
3788                         StructCopy(&accum, data->start_class, regnode_ssc);
3789                         flags |= SCF_DO_STCLASS_OR;
3790                     }
3791                 }
3792
3793                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3794                         OP( startbranch ) == BRANCH )
3795                 {
3796                 /* demq.
3797
3798                    Assuming this was/is a branch we are dealing with: 'scan'
3799                    now points at the item that follows the branch sequence,
3800                    whatever it is. We now start at the beginning of the
3801                    sequence and look for subsequences of
3802
3803                    BRANCH->EXACT=>x1
3804                    BRANCH->EXACT=>x2
3805                    tail
3806
3807                    which would be constructed from a pattern like
3808                    /A|LIST|OF|WORDS/
3809
3810                    If we can find such a subsequence we need to turn the first
3811                    element into a trie and then add the subsequent branch exact
3812                    strings to the trie.
3813
3814                    We have two cases
3815
3816                      1. patterns where the whole set of branches can be
3817                         converted.
3818
3819                      2. patterns where only a subset can be converted.
3820
3821                    In case 1 we can replace the whole set with a single regop
3822                    for the trie. In case 2 we need to keep the start and end
3823                    branches so
3824
3825                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3826                      becomes BRANCH TRIE; BRANCH X;
3827
3828                   There is an additional case, that being where there is a
3829                   common prefix, which gets split out into an EXACT like node
3830                   preceding the TRIE node.
3831
3832                   If x(1..n)==tail then we can do a simple trie, if not we make
3833                   a "jump" trie, such that when we match the appropriate word
3834                   we "jump" to the appropriate tail node. Essentially we turn
3835                   a nested if into a case structure of sorts.
3836
3837                 */
3838
3839                     int made=0;
3840                     if (!re_trie_maxbuff) {
3841                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3842                         if (!SvIOK(re_trie_maxbuff))
3843                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3844                     }
3845                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3846                         regnode *cur;
3847                         regnode *first = (regnode *)NULL;
3848                         regnode *last = (regnode *)NULL;
3849                         regnode *tail = scan;
3850                         U8 trietype = 0;
3851                         U32 count=0;
3852
3853 #ifdef DEBUGGING
3854                         SV * const mysv = sv_newmortal();   /* for dumping */
3855 #endif
3856                         /* var tail is used because there may be a TAIL
3857                            regop in the way. Ie, the exacts will point to the
3858                            thing following the TAIL, but the last branch will
3859                            point at the TAIL. So we advance tail. If we
3860                            have nested (?:) we may have to move through several
3861                            tails.
3862                          */
3863
3864                         while ( OP( tail ) == TAIL ) {
3865                             /* this is the TAIL generated by (?:) */
3866                             tail = regnext( tail );
3867                         }
3868
3869
3870                         DEBUG_TRIE_COMPILE_r({
3871                             regprop(RExC_rx, mysv, tail, NULL);
3872                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3873                               (int)depth * 2 + 2, "",
3874                               "Looking for TRIE'able sequences. Tail node is: ",
3875                               SvPV_nolen_const( mysv )
3876                             );
3877                         });
3878
3879                         /*
3880
3881                             Step through the branches
3882                                 cur represents each branch,
3883                                 noper is the first thing to be matched as part
3884                                       of that branch
3885                                 noper_next is the regnext() of that node.
3886
3887                             We normally handle a case like this
3888                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3889                             support building with NOJUMPTRIE, which restricts
3890                             the trie logic to structures like /FOO|BAR/.
3891
3892                             If noper is a trieable nodetype then the branch is
3893                             a possible optimization target. If we are building
3894                             under NOJUMPTRIE then we require that noper_next is
3895                             the same as scan (our current position in the regex
3896                             program).
3897
3898                             Once we have two or more consecutive such branches
3899                             we can create a trie of the EXACT's contents and
3900                             stitch it in place into the program.
3901
3902                             If the sequence represents all of the branches in
3903                             the alternation we replace the entire thing with a
3904                             single TRIE node.
3905
3906                             Otherwise when it is a subsequence we need to
3907                             stitch it in place and replace only the relevant
3908                             branches. This means the first branch has to remain
3909                             as it is used by the alternation logic, and its
3910                             next pointer, and needs to be repointed at the item
3911                             on the branch chain following the last branch we
3912                             have optimized away.
3913
3914                             This could be either a BRANCH, in which case the
3915                             subsequence is internal, or it could be the item
3916                             following the branch sequence in which case the
3917                             subsequence is at the end (which does not
3918                             necessarily mean the first node is the start of the
3919                             alternation).
3920
3921                             TRIE_TYPE(X) is a define which maps the optype to a
3922                             trietype.
3923
3924                                 optype          |  trietype
3925                                 ----------------+-----------
3926                                 NOTHING         | NOTHING
3927                                 EXACT           | EXACT
3928                                 EXACTFU         | EXACTFU
3929                                 EXACTFU_SS      | EXACTFU
3930                                 EXACTFA         | EXACTFA
3931
3932
3933                         */
3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3935                        ( EXACT == (X) )   ? EXACT :        \
3936                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3937                        ( EXACTFA == (X) ) ? EXACTFA :        \
3938                        0 )
3939
3940                         /* don't use tail as the end marker for this traverse */
3941                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3942                             regnode * const noper = NEXTOPER( cur );
3943                             U8 noper_type = OP( noper );
3944                             U8 noper_trietype = TRIE_TYPE( noper_type );
3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3946                             regnode * const noper_next = regnext( noper );
3947                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3948                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3949 #endif
3950
3951                             DEBUG_TRIE_COMPILE_r({
3952                                 regprop(RExC_rx, mysv, cur, NULL);
3953                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3954                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3955
3956                                 regprop(RExC_rx, mysv, noper, NULL);
3957                                 PerlIO_printf( Perl_debug_log, " -> %s",
3958                                     SvPV_nolen_const(mysv));
3959
3960                                 if ( noper_next ) {
3961                                   regprop(RExC_rx, mysv, noper_next, NULL);
3962                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3963                                     SvPV_nolen_const(mysv));
3964                                 }
3965                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3966                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3967                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3968                                 );
3969                             });
3970
3971                             /* Is noper a trieable nodetype that can be merged
3972                              * with the current trie (if there is one)? */
3973                             if ( noper_trietype
3974                                   &&
3975                                   (
3976                                         ( noper_trietype == NOTHING)
3977                                         || ( trietype == NOTHING )
3978                                         || ( trietype == noper_trietype )
3979                                   )
3980 #ifdef NOJUMPTRIE
3981                                   && noper_next == tail
3982 #endif
3983                                   && count < U16_MAX)
3984                             {
3985                                 /* Handle a mergeable trieable node.  Either we
3986                                  * are the first node in a new trieable sequence,
3987                                  * in which case we do some bookkeeping, or
3988                                  * otherwise we update the end pointer. */
3989                                 if ( !first ) {
3990                                     first = cur;
3991                                     if ( noper_trietype == NOTHING ) {
3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3993                                         regnode * const noper_next = regnext( noper );
3994                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3995                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3996 #endif
3997
3998                                         if ( noper_next_trietype ) {
3999                                             trietype = noper_next_trietype;
4000                                         } else if (noper_next_type)  {
4001                                             /* a NOTHING regop is 1 regop wide.
4002                                              * We need at least two for a trie
4003                                              * so we can't merge this in */
4004                                             first = NULL;
4005                                         }
4006                                     } else {
4007                                         trietype = noper_trietype;
4008                                     }
4009                                 } else {
4010                                     if ( trietype == NOTHING )
4011                                         trietype = noper_trietype;
4012                                     last = cur;
4013                                 }
4014                                 if (first)
4015                                     count++;
4016                             } /* end handle mergable triable node */
4017                             else {
4018                                 /* handle unmergeable node -
4019                                  * noper may either be a trieable node which can
4020                                  * not be merged with the current trie,
4021                                  * or a non-trieable node */
4022                                 if ( last ) {
4023                                     /* If last is set and trietype is not
4024                                      * NOTHING then we have found at least two
4025                                      * triable branch sequences in a row of a
4026                                      * similar trietype so we can turn them
4027                                      * into a trie. If/when we allow NOTHING to
4028                                      * start a trie sequence this condition
4029                                      * will be required, and it isn't expensive
4030                                      * so we leave it in for now. */
4031                                     if ( trietype && trietype != NOTHING )
4032                                         make_trie( pRExC_state,
4033                                                 startbranch, first, cur, tail,
4034                                                 count, trietype, depth+1 );
4035                                     last = NULL; /* note: we clear/update
4036                                                     first, trietype etc below,
4037                                                     so we dont do it here */
4038                                 }
4039                                 if ( noper_trietype
4040 #ifdef NOJUMPTRIE
4041                                      && noper_next == tail
4042 #endif
4043                                 ){
4044                                     /* noper is triable, so we can start a new
4045                                      * trie sequence */
4046                                     count = 1;
4047                                     first = cur;
4048                                     trietype = noper_trietype;
4049                                 } else if (first) {
4050                                     /* if we already saw a first but the
4051                                      * current node is not triable then we have
4052                                      * to reset the first information. */
4053                                     count = 0;
4054                                     first = NULL;
4055                                     trietype = 0;
4056                                 }
4057                             } /* end handle unmergable node */
4058                         } /* loop over branches */
4059                         DEBUG_TRIE_COMPILE_r({
4060                             regprop(RExC_rx, mysv, cur, NULL);
4061                             PerlIO_printf( Perl_debug_log,
4062                               "%*s- %s (%d) <SCAN FINISHED>\n",
4063                               (int)depth * 2 + 2,
4064                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4065
4066                         });
4067                         if ( last && trietype ) {
4068                             if ( trietype != NOTHING ) {
4069                                 /* the last branch of the sequence was part of
4070                                  * a trie, so we have to construct it here
4071                                  * outside of the loop */
4072                                 made= make_trie( pRExC_state, startbranch,
4073                                                  first, scan, tail, count,
4074                                                  trietype, depth+1 );
4075 #ifdef TRIE_STUDY_OPT
4076                                 if ( ((made == MADE_EXACT_TRIE &&
4077                                      startbranch == first)
4078                                      || ( first_non_open == first )) &&
4079                                      depth==0 ) {
4080                                     flags |= SCF_TRIE_RESTUDY;
4081                                     if ( startbranch == first
4082                                          && scan == tail )
4083                                     {
4084                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4085                                     }
4086                                 }
4087 #endif
4088                             } else {
4089                                 /* at this point we know whatever we have is a
4090                                  * NOTHING sequence/branch AND if 'startbranch'
4091                                  * is 'first' then we can turn the whole thing
4092                                  * into a NOTHING
4093                                  */
4094                                 if ( startbranch == first ) {
4095                                     regnode *opt;
4096                                     /* the entire thing is a NOTHING sequence,
4097                                      * something like this: (?:|) So we can
4098                                      * turn it into a plain NOTHING op. */
4099                                     DEBUG_TRIE_COMPILE_r({
4100                                         regprop(RExC_rx, mysv, cur, NULL);
4101                                         PerlIO_printf( Perl_debug_log,
4102                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4103                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4104
4105                                     });
4106                                     OP(startbranch)= NOTHING;
4107                                     NEXT_OFF(startbranch)= tail - startbranch;
4108                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4109                                         OP(opt)= OPTIMIZED;
4110                                 }
4111                             }
4112                         } /* end if ( last) */
4113                     } /* TRIE_MAXBUF is non zero */
4114
4115                 } /* do trie */
4116
4117             }
4118             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4119                 scan = NEXTOPER(NEXTOPER(scan));
4120             } else                      /* single branch is optimized. */
4121                 scan = NEXTOPER(scan);
4122             continue;
4123         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4124             scan_frame *newframe = NULL;
4125             I32 paren;
4126             regnode *start;
4127             regnode *end;
4128             U32 my_recursed_depth= recursed_depth;
4129
4130             if (OP(scan) != SUSPEND) {
4131                 /* set the pointer */
4132                 if (OP(scan) == GOSUB) {
4133                     paren = ARG(scan);
4134                     RExC_recurse[ARG2L(scan)] = scan;
4135                     start = RExC_open_parens[paren-1];
4136                     end   = RExC_close_parens[paren-1];
4137                 } else {
4138                     paren = 0;
4139                     start = RExC_rxi->program + 1;
4140                     end   = RExC_opend;
4141                 }
4142                 if (!recursed_depth
4143                     ||
4144                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4145                 ) {
4146                     if (!recursed_depth) {
4147                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4148                     } else {
4149                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4150                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed_bytes, U8);
4152                     }
4153                     /* we haven't recursed into this paren yet, so recurse into it */
4154                     DEBUG_STUDYDATA("set:", data,depth);
4155                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4156                     my_recursed_depth= recursed_depth + 1;
4157                     Newx(newframe,1,scan_frame);
4158                 } else {
4159                     DEBUG_STUDYDATA("inf:", data,depth);
4160                     /* some form of infinite recursion, assume infinite length
4161                      * */
4162                     if (flags & SCF_DO_SUBSTR) {
4163                         scan_commit(pRExC_state, data, minlenp, is_inf);
4164                         data->longest = &(data->longest_float);
4165                     }
4166                     is_inf = is_inf_internal = 1;
4167                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4168                         ssc_anything(data->start_class);
4169                     flags &= ~SCF_DO_STCLASS;
4170                 }
4171             } else {
4172                 Newx(newframe,1,scan_frame);
4173                 paren = stopparen;
4174                 start = scan+2;
4175                 end = regnext(scan);
4176             }
4177             if (newframe) {
4178                 assert(start);
4179                 assert(end);
4180                 SAVEFREEPV(newframe);
4181                 newframe->next = regnext(scan);
4182                 newframe->last = last;
4183                 newframe->stop = stopparen;
4184                 newframe->prev = frame;
4185                 newframe->prev_recursed_depth = recursed_depth;
4186
4187                 DEBUG_STUDYDATA("frame-new:",data,depth);
4188                 DEBUG_PEEP("fnew", scan, depth);
4189
4190                 frame = newframe;
4191                 scan =  start;
4192                 stopparen = paren;
4193                 last = end;
4194                 depth = depth + 1;
4195                 recursed_depth= my_recursed_depth;
4196
4197                 continue;
4198             }
4199         }
4200         else if (OP(scan) == EXACT) {
4201             SSize_t l = STR_LEN(scan);
4202             UV uc;
4203             if (UTF) {
4204                 const U8 * const s = (U8*)STRING(scan);
4205                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4206                 l = utf8_length(s, s + l);
4207             } else {
4208                 uc = *((U8*)STRING(scan));
4209             }
4210             min += l;
4211             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4212                 /* The code below prefers earlier match for fixed
4213                    offset, later match for variable offset.  */
4214                 if (data->last_end == -1) { /* Update the start info. */
4215                     data->last_start_min = data->pos_min;
4216                     data->last_start_max = is_inf
4217                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4218                 }
4219                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4220                 if (UTF)
4221                     SvUTF8_on(data->last_found);
4222                 {
4223                     SV * const sv = data->last_found;
4224                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4225                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4226                     if (mg && mg->mg_len >= 0)
4227                         mg->mg_len += utf8_length((U8*)STRING(scan),
4228                                               (U8*)STRING(scan)+STR_LEN(scan));
4229                 }
4230                 data->last_end = data->pos_min + l;
4231                 data->pos_min += l; /* As in the first entry. */
4232                 data->flags &= ~SF_BEFORE_EOL;
4233             }
4234
4235             /* ANDing the code point leaves at most it, and not in locale, and
4236              * can't match null string */
4237             if (flags & SCF_DO_STCLASS_AND) {
4238                 ssc_cp_and(data->start_class, uc);
4239                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4240                 ssc_clear_locale(data->start_class);
4241             }
4242             else if (flags & SCF_DO_STCLASS_OR) {
4243                 ssc_add_cp(data->start_class, uc);
4244                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4245
4246                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4247                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4248             }
4249             flags &= ~SCF_DO_STCLASS;
4250         }
4251         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4252             SSize_t l = STR_LEN(scan);
4253             UV uc = *((U8*)STRING(scan));
4254             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4255                                                      separate code points */
4256
4257             /* Search for fixed substrings supports EXACT only. */
4258             if (flags & SCF_DO_SUBSTR) {
4259                 assert(data);
4260                 scan_commit(pRExC_state, data, minlenp, is_inf);
4261             }
4262             if (UTF) {
4263                 const U8 * const s = (U8 *)STRING(scan);
4264                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4265                 l = utf8_length(s, s + l);
4266             }
4267             if (unfolded_multi_char) {
4268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4269             }
4270             min += l - min_subtract;
4271             assert (min >= 0);
4272             delta += min_subtract;
4273             if (flags & SCF_DO_SUBSTR) {
4274                 data->pos_min += l - min_subtract;
4275                 if (data->pos_min < 0) {
4276                     data->pos_min = 0;
4277                 }
4278                 data->pos_delta += min_subtract;
4279                 if (min_subtract) {
4280                     data->longest = &(data->longest_float);
4281                 }
4282             }
4283             if (OP(scan) == EXACTFL) {
4284
4285                 /* We don't know what the folds are; it could be anything. XXX
4286                  * Actually, we only support UTF-8 encoding for code points
4287                  * above Latin1, so we could know what those folds are. */
4288                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4289                                                        0,
4290                                                        UV_MAX);
4291             }
4292             else {  /* Non-locale EXACTFish */
4293                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4294                 if (flags & SCF_DO_STCLASS_AND) {
4295                     ssc_clear_locale(data->start_class);
4296                 }
4297                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4298                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4299                                                        know if anything folds
4300                                                        with this */
4301                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4302                                                            PL_fold_latin1[uc]);
4303                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4304                                                       legal under /iaa */
4305                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4306                                 EXACTF_invlist
4307                                     = add_cp_to_invlist(EXACTF_invlist,
4308                                                 LATIN_SMALL_LETTER_SHARP_S);
4309                             }
4310                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4311                                 EXACTF_invlist
4312                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4313                                 EXACTF_invlist
4314                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4315                             }
4316                         }
4317
4318                         /* We also know if there are above-Latin1 code points
4319                          * that fold to this (none legal for ASCII and /iaa) */
4320                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4321                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4322                         {
4323                             /* XXX We could know exactly what does fold to this
4324                              * if the reverse folds are loaded, as currently in
4325                              * S_regclass() */
4326                             _invlist_union(EXACTF_invlist,
4327                                            PL_AboveLatin1,
4328                                            &EXACTF_invlist);
4329                         }
4330                     }
4331                 }
4332                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4333                            know what participates in folds with this, so have
4334                            to assume anything could */
4335
4336                     /* XXX We could know exactly what does fold to this if the
4337                      * reverse folds are loaded, as currently in S_regclass().
4338                      * But we do know that under /iaa nothing in the ASCII
4339                      * range can participate */
4340                     if (OP(scan) == EXACTFA) {
4341                         _invlist_union_complement_2nd(EXACTF_invlist,
4342                                                       PL_XPosix_ptrs[_CC_ASCII],
4343                                                       &EXACTF_invlist);
4344                     }
4345                     else {
4346                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4347                                                                0, UV_MAX);
4348                     }
4349                 }
4350             }
4351             if (flags & SCF_DO_STCLASS_AND) {
4352                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4353                 ANYOF_POSIXL_ZERO(data->start_class);
4354                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4355             }
4356             else if (flags & SCF_DO_STCLASS_OR) {
4357                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4358                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4359
4360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4361                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4362             }
4363             flags &= ~SCF_DO_STCLASS;
4364             SvREFCNT_dec(EXACTF_invlist);
4365         }
4366         else if (REGNODE_VARIES(OP(scan))) {
4367             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4368             I32 fl = 0, f = flags;
4369             regnode * const oscan = scan;
4370             regnode_ssc this_class;
4371             regnode_ssc *oclass = NULL;
4372             I32 next_is_eval = 0;
4373
4374             switch (PL_regkind[OP(scan)]) {
4375             case WHILEM:                /* End of (?:...)* . */
4376                 scan = NEXTOPER(scan);
4377                 goto finish;
4378             case PLUS:
4379                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4380                     next = NEXTOPER(scan);
4381                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4382                         mincount = 1;
4383                         maxcount = REG_INFTY;
4384                         next = regnext(scan);
4385                         scan = NEXTOPER(scan);
4386                         goto do_curly;
4387                     }
4388                 }
4389                 if (flags & SCF_DO_SUBSTR)
4390                     data->pos_min++;
4391                 min++;
4392                 /* Fall through. */
4393             case STAR:
4394                 if (flags & SCF_DO_STCLASS) {
4395                     mincount = 0;
4396                     maxcount = REG_INFTY;
4397                     next = regnext(scan);
4398                     scan = NEXTOPER(scan);
4399                     goto do_curly;
4400                 }
4401                 if (flags & SCF_DO_SUBSTR) {
4402                     scan_commit(pRExC_state, data, minlenp, is_inf);
4403                     /* Cannot extend fixed substrings */
4404                     data->longest = &(data->longest_float);
4405                 }
4406                 is_inf = is_inf_internal = 1;
4407                 scan = regnext(scan);
4408                 goto optimize_curly_tail;
4409             case CURLY:
4410                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4411                     && (scan->flags == stopparen))
4412                 {
4413                     mincount = 1;
4414                     maxcount = 1;
4415                 } else {
4416                     mincount = ARG1(scan);
4417                     maxcount = ARG2(scan);
4418                 }
4419                 next = regnext(scan);
4420                 if (OP(scan) == CURLYX) {
4421                     I32 lp = (data ? *(data->last_closep) : 0);
4422                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4423                 }
4424                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4425                 next_is_eval = (OP(scan) == EVAL);
4426               do_curly:
4427                 if (flags & SCF_DO_SUBSTR) {
4428                     if (mincount == 0)
4429                         scan_commit(pRExC_state, data, minlenp, is_inf);
4430                     /* Cannot extend fixed substrings */
4431                     pos_before = data->pos_min;
4432                 }
4433                 if (data) {
4434                     fl = data->flags;
4435                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4436                     if (is_inf)
4437                         data->flags |= SF_IS_INF;
4438                 }
4439                 if (flags & SCF_DO_STCLASS) {
4440                     ssc_init(pRExC_state, &this_class);
4441                     oclass = data->start_class;
4442                     data->start_class = &this_class;
4443                     f |= SCF_DO_STCLASS_AND;
4444                     f &= ~SCF_DO_STCLASS_OR;
4445                 }
4446                 /* Exclude from super-linear cache processing any {n,m}
4447                    regops for which the combination of input pos and regex
4448                    pos is not enough information to determine if a match
4449                    will be possible.
4450
4451                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4452                    regex pos at the \s*, the prospects for a match depend not
4453                    only on the input position but also on how many (bar\s*)
4454                    repeats into the {4,8} we are. */
4455                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4456                     f &= ~SCF_WHILEM_VISITED_POS;
4457
4458                 /* This will finish on WHILEM, setting scan, or on NULL: */
4459                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4460                                   last, data, stopparen, recursed_depth, NULL,
4461                                   (mincount == 0
4462                                    ? (f & ~SCF_DO_SUBSTR)
4463                                    : f)
4464                                   ,depth+1);
4465
4466                 if (flags & SCF_DO_STCLASS)
4467                     data->start_class = oclass;
4468                 if (mincount == 0 || minnext == 0) {
4469                     if (flags & SCF_DO_STCLASS_OR) {
4470                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4471                     }
4472                     else if (flags & SCF_DO_STCLASS_AND) {
4473                         /* Switch to OR mode: cache the old value of
4474                          * data->start_class */
4475                         INIT_AND_WITHP;
4476                         StructCopy(data->start_class, and_withp, regnode_ssc);
4477                         flags &= ~SCF_DO_STCLASS_AND;
4478                         StructCopy(&this_class, data->start_class, regnode_ssc);
4479                         flags |= SCF_DO_STCLASS_OR;
4480                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4481                     }
4482                 } else {                /* Non-zero len */
4483                     if (flags & SCF_DO_STCLASS_OR) {
4484                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4485                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4486                     }
4487                     else if (flags & SCF_DO_STCLASS_AND)
4488                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489                     flags &= ~SCF_DO_STCLASS;
4490                 }
4491                 if (!scan)              /* It was not CURLYX, but CURLY. */
4492                     scan = next;
4493                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4494                     /* ? quantifier ok, except for (?{ ... }) */
4495                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4496                     && (minnext == 0) && (deltanext == 0)
4497                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4498                     && maxcount <= REG_INFTY/3) /* Complement check for big
4499                                                    count */
4500                 {
4501                     /* Fatal warnings may leak the regexp without this: */
4502                     SAVEFREESV(RExC_rx_sv);
4503                     ckWARNreg(RExC_parse,
4504                             "Quantifier unexpected on zero-length expression");
4505                     (void)ReREFCNT_inc(RExC_rx_sv);
4506                 }
4507
4508                 min += minnext * mincount;
4509                 is_inf_internal |= deltanext == SSize_t_MAX
4510                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4511                 is_inf |= is_inf_internal;
4512                 if (is_inf) {
4513                     delta = SSize_t_MAX;
4514                 } else {
4515                     delta += (minnext + deltanext) * maxcount
4516                              - minnext * mincount;
4517                 }
4518                 /* Try powerful optimization CURLYX => CURLYN. */
4519                 if (  OP(oscan) == CURLYX && data
4520                       && data->flags & SF_IN_PAR
4521                       && !(data->flags & SF_HAS_EVAL)
4522                       && !deltanext && minnext == 1 ) {
4523                     /* Try to optimize to CURLYN.  */
4524                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4525                     regnode * const nxt1 = nxt;
4526 #ifdef DEBUGGING
4527                     regnode *nxt2;
4528 #endif
4529
4530                     /* Skip open. */
4531                     nxt = regnext(nxt);
4532                     if (!REGNODE_SIMPLE(OP(nxt))
4533                         && !(PL_regkind[OP(nxt)] == EXACT
4534                              && STR_LEN(nxt) == 1))
4535                         goto nogo;
4536 #ifdef DEBUGGING
4537                     nxt2 = nxt;
4538 #endif
4539                     nxt = regnext(nxt);
4540                     if (OP(nxt) != CLOSE)
4541                         goto nogo;
4542                     if (RExC_open_parens) {
4543                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4544                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4545                     }
4546                     /* Now we know that nxt2 is the only contents: */
4547                     oscan->flags = (U8)ARG(nxt);
4548                     OP(oscan) = CURLYN;
4549                     OP(nxt1) = NOTHING; /* was OPEN. */
4550
4551 #ifdef DEBUGGING
4552                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4553                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4554                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4555                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4556                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4557                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4558 #endif
4559                 }
4560               nogo:
4561
4562                 /* Try optimization CURLYX => CURLYM. */
4563                 if (  OP(oscan) == CURLYX && data
4564                       && !(data->flags & SF_HAS_PAR)
4565                       && !(data->flags & SF_HAS_EVAL)
4566                       && !deltanext     /* atom is fixed width */
4567                       && minnext != 0   /* CURLYM can't handle zero width */
4568
4569                          /* Nor characters whose fold at run-time may be
4570                           * multi-character */
4571                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4572                 ) {
4573                     /* XXXX How to optimize if data == 0? */
4574                     /* Optimize to a simpler form.  */
4575                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4576                     regnode *nxt2;
4577
4578                     OP(oscan) = CURLYM;
4579                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4580                             && (OP(nxt2) != WHILEM))
4581                         nxt = nxt2;
4582                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4583                     /* Need to optimize away parentheses. */
4584                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4585                         /* Set the parenth number.  */
4586                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4587
4588                         oscan->flags = (U8)ARG(nxt);
4589                         if (RExC_open_parens) {
4590                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4591                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4592                         }
4593                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4594                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4595
4596 #ifdef DEBUGGING
4597                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4598                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4599                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4600                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4601 #endif
4602 #if 0
4603                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4604                             regnode *nnxt = regnext(nxt1);
4605                             if (nnxt == nxt) {
4606                                 if (reg_off_by_arg[OP(nxt1)])
4607                                     ARG_SET(nxt1, nxt2 - nxt1);
4608                                 else if (nxt2 - nxt1 < U16_MAX)
4609                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4610                                 else
4611                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4612                             }
4613                             nxt1 = nnxt;
4614                         }
4615 #endif
4616                         /* Optimize again: */
4617                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4618                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4619                     }
4620                     else
4621                         oscan->flags = 0;
4622                 }
4623                 else if ((OP(oscan) == CURLYX)
4624                          && (flags & SCF_WHILEM_VISITED_POS)
4625                          /* See the comment on a similar expression above.
4626                             However, this time it's not a subexpression
4627                             we care about, but the expression itself. */
4628                          && (maxcount == REG_INFTY)
4629                          && data && ++data->whilem_c < 16) {
4630                     /* This stays as CURLYX, we can put the count/of pair. */
4631                     /* Find WHILEM (as in regexec.c) */
4632                     regnode *nxt = oscan + NEXT_OFF(oscan);
4633
4634                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4635                         nxt += ARG(nxt);
4636                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4637                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4638                 }
4639                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4640                     pars++;
4641                 if (flags & SCF_DO_SUBSTR) {
4642                     SV *last_str = NULL;
4643                     STRLEN last_chrs = 0;
4644                     int counted = mincount != 0;
4645
4646                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4647                                                                   string. */
4648                         SSize_t b = pos_before >= data->last_start_min
4649                             ? pos_before : data->last_start_min;
4650                         STRLEN l;
4651                         const char * const s = SvPV_const(data->last_found, l);
4652                         SSize_t old = b - data->last_start_min;
4653
4654                         if (UTF)
4655                             old = utf8_hop((U8*)s, old) - (U8*)s;
4656                         l -= old;
4657                         /* Get the added string: */
4658                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4659                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4660                                             (U8*)(s + old + l)) : l;
4661                         if (deltanext == 0 && pos_before == b) {
4662                             /* What was added is a constant string */
4663                             if (mincount > 1) {
4664
4665                                 SvGROW(last_str, (mincount * l) + 1);
4666                                 repeatcpy(SvPVX(last_str) + l,
4667                                           SvPVX_const(last_str), l,
4668                                           mincount - 1);
4669                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4670                                 /* Add additional parts. */
4671                                 SvCUR_set(data->last_found,
4672                                           SvCUR(data->last_found) - l);
4673                                 sv_catsv(data->last_found, last_str);
4674                                 {
4675                                     SV * sv = data->last_found;
4676                                     MAGIC *mg =
4677                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4678                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4679                                     if (mg && mg->mg_len >= 0)
4680                                         mg->mg_len += last_chrs * (mincount-1);
4681                                 }
4682                                 last_chrs *= mincount;
4683                                 data->last_end += l * (mincount - 1);
4684                             }
4685                         } else {
4686                             /* start offset must point into the last copy */
4687                             data->last_start_min += minnext * (mincount - 1);
4688                             data->last_start_max += is_inf ? SSize_t_MAX
4689                                 : (maxcount - 1) * (minnext + data->pos_delta);
4690                         }
4691                     }
4692                     /* It is counted once already... */
4693                     data->pos_min += minnext * (mincount - counted);
4694 #if 0
4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4696                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4697                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4698     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4699     (UV)mincount);
4700 if (deltanext != SSize_t_MAX)
4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4702     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4703           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4704 #endif
4705                     if (deltanext == SSize_t_MAX
4706                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4707                         data->pos_delta = SSize_t_MAX;
4708                     else
4709                         data->pos_delta += - counted * deltanext +
4710                         (minnext + deltanext) * maxcount - minnext * mincount;
4711                     if (mincount != maxcount) {
4712                          /* Cannot extend fixed substrings found inside
4713                             the group.  */
4714                         scan_commit(pRExC_state, data, minlenp, is_inf);
4715                         if (mincount && last_str) {
4716                             SV * const sv = data->last_found;
4717                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4718                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4719
4720                             if (mg)
4721                                 mg->mg_len = -1;
4722                             sv_setsv(sv, last_str);
4723                             data->last_end = data->pos_min;
4724                             data->last_start_min = data->pos_min - last_chrs;
4725                             data->last_start_max = is_inf
4726                                 ? SSize_t_MAX
4727                                 : data->pos_min + data->pos_delta - last_chrs;
4728                         }
4729                         data->longest = &(data->longest_float);
4730                     }
4731                     SvREFCNT_dec(last_str);
4732                 }
4733                 if (data && (fl & SF_HAS_EVAL))
4734                     data->flags |= SF_HAS_EVAL;
4735               optimize_curly_tail:
4736                 if (OP(oscan) != CURLYX) {
4737                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4738                            && NEXT_OFF(next))
4739                         NEXT_OFF(oscan) += NEXT_OFF(next);
4740                 }
4741                 continue;
4742
4743             default:
4744 #ifdef DEBUGGING
4745                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4746                                                                     OP(scan));
4747 #endif
4748             case REF:
4749             case CLUMP:
4750                 if (flags & SCF_DO_SUBSTR) {
4751                     /* Cannot expect anything... */
4752                     scan_commit(pRExC_state, data, minlenp, is_inf);
4753                     data->longest = &(data->longest_float);
4754                 }
4755                 is_inf = is_inf_internal = 1;
4756                 if (flags & SCF_DO_STCLASS_OR) {
4757                     if (OP(scan) == CLUMP) {
4758                         /* Actually is any start char, but very few code points
4759                          * aren't start characters */
4760                         ssc_match_all_cp(data->start_class);
4761                     }
4762                     else {
4763                         ssc_anything(data->start_class);
4764                     }
4765                 }
4766                 flags &= ~SCF_DO_STCLASS;
4767                 break;
4768             }
4769         }
4770         else if (OP(scan) == LNBREAK) {
4771             if (flags & SCF_DO_STCLASS) {
4772                 if (flags & SCF_DO_STCLASS_AND) {
4773                     ssc_intersection(data->start_class,
4774                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4775                     ssc_clear_locale(data->start_class);
4776                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4777                 }
4778                 else if (flags & SCF_DO_STCLASS_OR) {
4779                     ssc_union(data->start_class,
4780                               PL_XPosix_ptrs[_CC_VERTSPACE],
4781                               FALSE);
4782                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4783
4784                     /* See commit msg for
4785                      * 749e076fceedeb708a624933726e7989f2302f6a */
4786                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4787                 }
4788                 flags &= ~SCF_DO_STCLASS;
4789             }
4790             min++;
4791             delta++;    /* Because of the 2 char string cr-lf */
4792             if (flags & SCF_DO_SUBSTR) {
4793                 /* Cannot expect anything... */
4794                 scan_commit(pRExC_state, data, minlenp, is_inf);
4795                 data->pos_min += 1;
4796                 data->pos_delta += 1;
4797                 data->longest = &(data->longest_float);
4798             }
4799         }
4800         else if (REGNODE_SIMPLE(OP(scan))) {
4801
4802             if (flags & SCF_DO_SUBSTR) {
4803                 scan_commit(pRExC_state, data, minlenp, is_inf);
4804                 data->pos_min++;
4805             }
4806             min++;
4807             if (flags & SCF_DO_STCLASS) {
4808                 bool invert = 0;
4809                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4810                 U8 namedclass;
4811
4812                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4813                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4814
4815                 /* Some of the logic below assumes that switching
4816                    locale on will only add false positives. */
4817                 switch (OP(scan)) {
4818
4819                 default:
4820 #ifdef DEBUGGING
4821                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4822                                                                      OP(scan));
4823 #endif
4824                 case CANY:
4825                 case SANY:
4826                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4827                         ssc_match_all_cp(data->start_class);
4828                     break;
4829
4830                 case REG_ANY:
4831                     {
4832                         SV* REG_ANY_invlist = _new_invlist(2);
4833                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4834                                                             '\n');
4835                         if (flags & SCF_DO_STCLASS_OR) {
4836                             ssc_union(data->start_class,
4837                                       REG_ANY_invlist,
4838                                       TRUE /* TRUE => invert, hence all but \n
4839                                             */
4840                                       );
4841                         }
4842                         else if (flags & SCF_DO_STCLASS_AND) {
4843                             ssc_intersection(data->start_class,
4844                                              REG_ANY_invlist,
4845                                              TRUE  /* TRUE => invert */
4846                                              );
4847                             ssc_clear_locale(data->start_class);
4848                         }
4849                         SvREFCNT_dec_NN(REG_ANY_invlist);
4850                     }
4851                     break;
4852
4853                 case ANYOF:
4854                     if (flags & SCF_DO_STCLASS_AND)
4855                         ssc_and(pRExC_state, data->start_class,
4856                                 (regnode_charclass *) scan);
4857                     else
4858                         ssc_or(pRExC_state, data->start_class,
4859                                                           (regnode_charclass *) scan);
4860                     break;
4861
4862                 case NPOSIXL:
4863                     invert = 1;
4864                     /* FALL THROUGH */
4865
4866                 case POSIXL:
4867                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4868                     if (flags & SCF_DO_STCLASS_AND) {
4869                         bool was_there = cBOOL(
4870                                           ANYOF_POSIXL_TEST(data->start_class,
4871                                                                  namedclass));
4872                         ANYOF_POSIXL_ZERO(data->start_class);
4873                         if (was_there) {    /* Do an AND */
4874                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4875                         }
4876                         /* No individual code points can now match */
4877                         data->start_class->invlist
4878                                                 = sv_2mortal(_new_invlist(0));
4879                     }
4880                     else {
4881                         int complement = namedclass + ((invert) ? -1 : 1);
4882
4883                         assert(flags & SCF_DO_STCLASS_OR);
4884
4885                         /* If the complement of this class was already there,
4886                          * the result is that they match all code points,
4887                          * (\d + \D == everything).  Remove the classes from
4888                          * future consideration.  Locale is not relevant in
4889                          * this case */
4890                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4891                             ssc_match_all_cp(data->start_class);
4892                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4894                         }
4895                         else {  /* The usual case; just add this class to the
4896                                    existing set */
4897                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4898                         }
4899                     }
4900                     break;
4901
4902                 case NPOSIXA:   /* For these, we always know the exact set of
4903                                    what's matched */
4904                     invert = 1;
4905                     /* FALL THROUGH */
4906                 case POSIXA:
4907                     if (FLAGS(scan) == _CC_ASCII) {
4908                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4909                     }
4910                     else {
4911                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4912                                               PL_XPosix_ptrs[_CC_ASCII],
4913                                               &my_invlist);
4914                     }
4915                     goto join_posix;
4916
4917                 case NPOSIXD:
4918                 case NPOSIXU:
4919                     invert = 1;
4920                     /* FALL THROUGH */
4921                 case POSIXD:
4922                 case POSIXU:
4923                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4924
4925                     /* NPOSIXD matches all upper Latin1 code points unless the
4926                      * target string being matched is UTF-8, which is
4927                      * unknowable until match time.  Since we are going to
4928                      * invert, we want to get rid of all of them so that the
4929                      * inversion will match all */
4930                     if (OP(scan) == NPOSIXD) {
4931                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4932                                           &my_invlist);
4933                     }
4934
4935                   join_posix:
4936
4937                     if (flags & SCF_DO_STCLASS_AND) {
4938                         ssc_intersection(data->start_class, my_invlist, invert);
4939                         ssc_clear_locale(data->start_class);
4940                     }
4941                     else {
4942                         assert(flags & SCF_DO_STCLASS_OR);
4943                         ssc_union(data->start_class, my_invlist, invert);
4944                     }
4945                 }
4946                 if (flags & SCF_DO_STCLASS_OR)
4947                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4948                 flags &= ~SCF_DO_STCLASS;
4949             }
4950         }
4951         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4952             data->flags |= (OP(scan) == MEOL
4953                             ? SF_BEFORE_MEOL
4954                             : SF_BEFORE_SEOL);
4955             scan_commit(pRExC_state, data, minlenp, is_inf);
4956
4957         }
4958         else if (  PL_regkind[OP(scan)] == BRANCHJ
4959                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4960                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4961                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4962             if ( OP(scan) == UNLESSM &&
4963                  scan->flags == 0 &&
4964                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4965                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4966             ) {
4967                 regnode *opt;
4968                 regnode *upto= regnext(scan);
4969                 DEBUG_PARSE_r({
4970                     SV * const mysv_val=sv_newmortal();
4971                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4972
4973                     /*DEBUG_PARSE_MSG("opfail");*/
4974                     regprop(RExC_rx, mysv_val, upto, NULL);
4975                     PerlIO_printf(Perl_debug_log,
4976                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4977                         SvPV_nolen_const(mysv_val),
4978                         (IV)REG_NODE_NUM(upto),
4979                         (IV)(upto - scan)
4980                     );
4981                 });
4982                 OP(scan) = OPFAIL;
4983                 NEXT_OFF(scan) = upto - scan;
4984                 for (opt= scan + 1; opt < upto ; opt++)
4985                     OP(opt) = OPTIMIZED;
4986                 scan= upto;
4987                 continue;
4988             }
4989             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4990                 || OP(scan) == UNLESSM )
4991             {
4992                 /* Negative Lookahead/lookbehind
4993                    In this case we can't do fixed string optimisation.
4994                 */
4995
4996                 SSize_t deltanext, minnext, fake = 0;
4997                 regnode *nscan;
4998                 regnode_ssc intrnl;
4999                 int f = 0;
5000
5001                 data_fake.flags = 0;
5002                 if (data) {
5003                     data_fake.whilem_c = data->whilem_c;
5004                     data_fake.last_closep = data->last_closep;
5005                 }
5006                 else
5007                     data_fake.last_closep = &fake;
5008                 data_fake.pos_delta = delta;
5009                 if ( flags & SCF_DO_STCLASS && !scan->flags
5010                      && OP(scan) == IFMATCH ) { /* Lookahead */
5011                     ssc_init(pRExC_state, &intrnl);
5012                     data_fake.start_class = &intrnl;
5013                     f |= SCF_DO_STCLASS_AND;
5014                 }
5015                 if (flags & SCF_WHILEM_VISITED_POS)
5016                     f |= SCF_WHILEM_VISITED_POS;
5017                 next = regnext(scan);
5018                 nscan = NEXTOPER(NEXTOPER(scan));
5019                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5020                                       last, &data_fake, stopparen,
5021                                       recursed_depth, NULL, f, depth+1);
5022                 if (scan->flags) {
5023                     if (deltanext) {
5024                         FAIL("Variable length lookbehind not implemented");
5025                     }
5026                     else if (minnext > (I32)U8_MAX) {
5027                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5028                               (UV)U8_MAX);
5029                     }
5030                     scan->flags = (U8)minnext;
5031                 }
5032                 if (data) {
5033                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5034                         pars++;
5035                     if (data_fake.flags & SF_HAS_EVAL)
5036                         data->flags |= SF_HAS_EVAL;
5037                     data->whilem_c = data_fake.whilem_c;
5038                 }
5039                 if (f & SCF_DO_STCLASS_AND) {
5040                     if (flags & SCF_DO_STCLASS_OR) {
5041                         /* OR before, AND after: ideally we would recurse with
5042                          * data_fake to get the AND applied by study of the
5043                          * remainder of the pattern, and then derecurse;
5044                          * *** HACK *** for now just treat as "no information".
5045                          * See [perl #56690].
5046                          */
5047                         ssc_init(pRExC_state, data->start_class);
5048                     }  else {
5049                         /* AND before and after: combine and continue */
5050                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5051                     }
5052                 }
5053             }
5054 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5055             else {
5056                 /* Positive Lookahead/lookbehind
5057                    In this case we can do fixed string optimisation,
5058                    but we must be careful about it. Note in the case of
5059                    lookbehind the positions will be offset by the minimum
5060                    length of the pattern, something we won't know about
5061                    until after the recurse.
5062                 */
5063                 SSize_t deltanext, fake = 0;
5064                 regnode *nscan;
5065                 regnode_ssc intrnl;
5066                 int f = 0;
5067                 /* We use SAVEFREEPV so that when the full compile
5068                     is finished perl will clean up the allocated
5069                     minlens when it's all done. This way we don't
5070                     have to worry about freeing them when we know
5071                     they won't be used, which would be a pain.
5072                  */
5073                 SSize_t *minnextp;
5074                 Newx( minnextp, 1, SSize_t );
5075                 SAVEFREEPV(minnextp);
5076
5077                 if (data) {
5078                     StructCopy(data, &data_fake, scan_data_t);
5079                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5080                         f |= SCF_DO_SUBSTR;
5081                         if (scan->flags)
5082                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5083                         data_fake.last_found=newSVsv(data->last_found);
5084                     }
5085                 }
5086                 else
5087                     data_fake.last_closep = &fake;
5088                 data_fake.flags = 0;
5089                 data_fake.pos_delta = delta;
5090                 if (is_inf)
5091                     data_fake.flags |= SF_IS_INF;
5092                 if ( flags & SCF_DO_STCLASS && !scan->flags
5093                      && OP(scan) == IFMATCH ) { /* Lookahead */
5094                     ssc_init(pRExC_state, &intrnl);
5095                     data_fake.start_class = &intrnl;
5096                     f |= SCF_DO_STCLASS_AND;
5097                 }
5098                 if (flags & SCF_WHILEM_VISITED_POS)
5099                     f |= SCF_WHILEM_VISITED_POS;
5100                 next = regnext(scan);
5101                 nscan = NEXTOPER(NEXTOPER(scan));
5102
5103                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5104                                         &deltanext, last, &data_fake,
5105                                         stopparen, recursed_depth, NULL,
5106                                         f,depth+1);
5107                 if (scan->flags) {
5108                     if (deltanext) {
5109                         FAIL("Variable length lookbehind not implemented");
5110                     }
5111                     else if (*minnextp > (I32)U8_MAX) {
5112                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5113                               (UV)U8_MAX);
5114                     }
5115                     scan->flags = (U8)*minnextp;
5116                 }
5117
5118                 *minnextp += min;
5119
5120                 if (f & SCF_DO_STCLASS_AND) {
5121                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5122                 }
5123                 if (data) {
5124                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5125                         pars++;
5126                     if (data_fake.flags & SF_HAS_EVAL)
5127                         data->flags |= SF_HAS_EVAL;
5128                     data->whilem_c = data_fake.whilem_c;
5129                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5130                         if (RExC_rx->minlen<*minnextp)
5131                             RExC_rx->minlen=*minnextp;
5132                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5133                         SvREFCNT_dec_NN(data_fake.last_found);
5134
5135                         if ( data_fake.minlen_fixed != minlenp )
5136                         {
5137                             data->offset_fixed= data_fake.offset_fixed;
5138                             data->minlen_fixed= data_fake.minlen_fixed;
5139                             data->lookbehind_fixed+= scan->flags;
5140                         }
5141                         if ( data_fake.minlen_float != minlenp )
5142                         {
5143                             data->minlen_float= data_fake.minlen_float;
5144                             data->offset_float_min=data_fake.offset_float_min;
5145                             data->offset_float_max=data_fake.offset_float_max;
5146                             data->lookbehind_float+= scan->flags;
5147                         }
5148                     }
5149                 }
5150             }
5151 #endif
5152         }
5153         else if (OP(scan) == OPEN) {
5154             if (stopparen != (I32)ARG(scan))
5155                 pars++;
5156         }
5157         else if (OP(scan) == CLOSE) {
5158             if (stopparen == (I32)ARG(scan)) {
5159                 break;
5160             }
5161             if ((I32)ARG(scan) == is_par) {
5162                 next = regnext(scan);
5163
5164                 if ( next && (OP(next) != WHILEM) && next < last)
5165                     is_par = 0;         /* Disable optimization */
5166             }
5167             if (data)
5168                 *(data->last_closep) = ARG(scan);
5169         }
5170         else if (OP(scan) == EVAL) {
5171                 if (data)
5172                     data->flags |= SF_HAS_EVAL;
5173         }
5174         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5175             if (flags & SCF_DO_SUBSTR) {
5176                 scan_commit(pRExC_state, data, minlenp, is_inf);
5177                 flags &= ~SCF_DO_SUBSTR;
5178             }
5179             if (data && OP(scan)==ACCEPT) {
5180                 data->flags |= SCF_SEEN_ACCEPT;
5181                 if (stopmin > min)
5182                     stopmin = min;
5183             }
5184         }
5185         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5186         {
5187                 if (flags & SCF_DO_SUBSTR) {
5188                     scan_commit(pRExC_state, data, minlenp, is_inf);
5189                     data->longest = &(data->longest_float);
5190                 }
5191                 is_inf = is_inf_internal = 1;
5192                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5193                     ssc_anything(data->start_class);
5194                 flags &= ~SCF_DO_STCLASS;
5195         }
5196         else if (OP(scan) == GPOS) {
5197             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5198                 !(delta || is_inf || (data && data->pos_delta)))
5199             {
5200                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5201                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5202                 if (RExC_rx->gofs < (STRLEN)min)
5203                     RExC_rx->gofs = min;
5204             } else {
5205                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5206                 RExC_rx->gofs = 0;
5207             }
5208         }
5209 #ifdef TRIE_STUDY_OPT
5210 #ifdef FULL_TRIE_STUDY
5211         else if (PL_regkind[OP(scan)] == TRIE) {
5212             /* NOTE - There is similar code to this block above for handling
5213                BRANCH nodes on the initial study.  If you change stuff here
5214                check there too. */
5215             regnode *trie_node= scan;
5216             regnode *tail= regnext(scan);
5217             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5218             SSize_t max1 = 0, min1 = SSize_t_MAX;
5219             regnode_ssc accum;
5220
5221             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5222                 /* Cannot merge strings after this. */
5223                 scan_commit(pRExC_state, data, minlenp, is_inf);
5224             }
5225             if (flags & SCF_DO_STCLASS)
5226                 ssc_init_zero(pRExC_state, &accum);
5227
5228             if (!trie->jump) {
5229                 min1= trie->minlen;
5230                 max1= trie->maxlen;
5231             } else {
5232                 const regnode *nextbranch= NULL;
5233                 U32 word;
5234
5235                 for ( word=1 ; word <= trie->wordcount ; word++)
5236                 {
5237                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5238                     regnode_ssc this_class;
5239
5240                     data_fake.flags = 0;
5241                     if (data) {
5242                         data_fake.whilem_c = data->whilem_c;
5243                         data_fake.last_closep = data->last_closep;
5244                     }
5245                     else
5246                         data_fake.last_closep = &fake;
5247                     data_fake.pos_delta = delta;
5248                     if (flags & SCF_DO_STCLASS) {
5249                         ssc_init(pRExC_state, &this_class);
5250                         data_fake.start_class = &this_class;
5251                         f = SCF_DO_STCLASS_AND;
5252                     }
5253                     if (flags & SCF_WHILEM_VISITED_POS)
5254                         f |= SCF_WHILEM_VISITED_POS;
5255
5256                     if (trie->jump[word]) {
5257                         if (!nextbranch)
5258                             nextbranch = trie_node + trie->jump[0];
5259                         scan= trie_node + trie->jump[word];
5260                         /* We go from the jump point to the branch that follows
5261                            it. Note this means we need the vestigal unused
5262                            branches even though they arent otherwise used. */
5263                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5264                             &deltanext, (regnode *)nextbranch, &data_fake,
5265                             stopparen, recursed_depth, NULL, f,depth+1);
5266                     }
5267                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5268                         nextbranch= regnext((regnode*)nextbranch);
5269
5270                     if (min1 > (SSize_t)(minnext + trie->minlen))
5271                         min1 = minnext + trie->minlen;
5272                     if (deltanext == SSize_t_MAX) {
5273                         is_inf = is_inf_internal = 1;
5274                         max1 = SSize_t_MAX;
5275                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5276                         max1 = minnext + deltanext + trie->maxlen;
5277
5278                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5279                         pars++;
5280                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5281                         if ( stopmin > min + min1)
5282                             stopmin = min + min1;
5283                         flags &= ~SCF_DO_SUBSTR;
5284                         if (data)
5285                             data->flags |= SCF_SEEN_ACCEPT;
5286                     }
5287                     if (data) {
5288                         if (data_fake.flags & SF_HAS_EVAL)
5289                             data->flags |= SF_HAS_EVAL;
5290                         data->whilem_c = data_fake.whilem_c;
5291                     }
5292                     if (flags & SCF_DO_STCLASS)
5293                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5294                 }
5295             }
5296             if (flags & SCF_DO_SUBSTR) {
5297                 data->pos_min += min1;
5298                 data->pos_delta += max1 - min1;
5299                 if (max1 != min1 || is_inf)
5300                     data->longest = &(data->longest_float);
5301             }
5302             min += min1;
5303             delta += max1 - min1;
5304             if (flags & SCF_DO_STCLASS_OR) {
5305                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5306                 if (min1) {
5307                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5308                     flags &= ~SCF_DO_STCLASS;
5309                 }
5310             }
5311             else if (flags & SCF_DO_STCLASS_AND) {
5312                 if (min1) {
5313                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5314                     flags &= ~SCF_DO_STCLASS;
5315                 }
5316                 else {
5317                     /* Switch to OR mode: cache the old value of
5318                      * data->start_class */
5319                     INIT_AND_WITHP;
5320                     StructCopy(data->start_class, and_withp, regnode_ssc);
5321                     flags &= ~SCF_DO_STCLASS_AND;
5322                     StructCopy(&accum, data->start_class, regnode_ssc);
5323                     flags |= SCF_DO_STCLASS_OR;
5324                 }
5325             }
5326             scan= tail;
5327             continue;
5328         }
5329 #else
5330         else if (PL_regkind[OP(scan)] == TRIE) {
5331             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5332             U8*bang=NULL;
5333
5334             min += trie->minlen;
5335             delta += (trie->maxlen - trie->minlen);
5336             flags &= ~SCF_DO_STCLASS; /* xxx */
5337             if (flags & SCF_DO_SUBSTR) {
5338                 /* Cannot expect anything... */
5339                 scan_commit(pRExC_state, data, minlenp, is_inf);
5340                 data->pos_min += trie->minlen;
5341                 data->pos_delta += (trie->maxlen - trie->minlen);
5342                 if (trie->maxlen != trie->minlen)
5343                     data->longest = &(data->longest_float);
5344             }
5345             if (trie->jump) /* no more substrings -- for now /grr*/
5346                flags &= ~SCF_DO_SUBSTR;
5347         }
5348 #endif /* old or new */
5349 #endif /* TRIE_STUDY_OPT */
5350
5351         /* Else: zero-length, ignore. */
5352         scan = regnext(scan);
5353     }
5354     /* If we are exiting a recursion we can unset its recursed bit
5355      * and allow ourselves to enter it again - no danger of an
5356      * infinite loop there.
5357     if (stopparen > -1 && recursed) {
5358         DEBUG_STUDYDATA("unset:", data,depth);
5359         PAREN_UNSET( recursed, stopparen);
5360     }
5361     */
5362     if (frame) {
5363         DEBUG_STUDYDATA("frame-end:",data,depth);
5364         DEBUG_PEEP("fend", scan, depth);
5365         /* restore previous context */
5366         last = frame->last;
5367         scan = frame->next;
5368         stopparen = frame->stop;
5369         recursed_depth = frame->prev_recursed_depth;
5370         depth = depth - 1;
5371
5372         frame = frame->prev;
5373         goto fake_study_recurse;
5374     }
5375
5376   finish:
5377     assert(!frame);
5378     DEBUG_STUDYDATA("pre-fin:",data,depth);
5379
5380     *scanp = scan;
5381     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5382
5383     if (flags & SCF_DO_SUBSTR && is_inf)
5384         data->pos_delta = SSize_t_MAX - data->pos_min;
5385     if (is_par > (I32)U8_MAX)
5386         is_par = 0;
5387     if (is_par && pars==1 && data) {
5388         data->flags |= SF_IN_PAR;
5389         data->flags &= ~SF_HAS_PAR;
5390     }
5391     else if (pars && data) {
5392         data->flags |= SF_HAS_PAR;
5393         data->flags &= ~SF_IN_PAR;
5394     }
5395     if (flags & SCF_DO_STCLASS_OR)
5396         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5397     if (flags & SCF_TRIE_RESTUDY)
5398         data->flags |=  SCF_TRIE_RESTUDY;
5399
5400     DEBUG_STUDYDATA("post-fin:",data,depth);
5401
5402     {
5403         SSize_t final_minlen= min < stopmin ? min : stopmin;
5404
5405         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5406             RExC_maxlen = final_minlen + delta;
5407         }
5408         return final_minlen;
5409     }
5410     /* not-reached */
5411 }
5412
5413 STATIC U32
5414 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5415 {
5416     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5417
5418     PERL_ARGS_ASSERT_ADD_DATA;
5419
5420     Renewc(RExC_rxi->data,
5421            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5422            char, struct reg_data);
5423     if(count)
5424         Renew(RExC_rxi->data->what, count + n, U8);
5425     else
5426         Newx(RExC_rxi->data->what, n, U8);
5427     RExC_rxi->data->count = count + n;
5428     Copy(s, RExC_rxi->data->what + count, n, U8);
5429     return count;
5430 }
5431
5432 /*XXX: todo make this not included in a non debugging perl */
5433 #ifndef PERL_IN_XSUB_RE
5434 void
5435 Perl_reginitcolors(pTHX)
5436 {
5437     dVAR;
5438     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5439     if (s) {
5440         char *t = savepv(s);
5441         int i = 0;
5442         PL_colors[0] = t;
5443         while (++i < 6) {
5444             t = strchr(t, '\t');
5445             if (t) {
5446                 *t = '\0';
5447                 PL_colors[i] = ++t;
5448             }
5449             else
5450                 PL_colors[i] = t = (char *)"";
5451         }
5452     } else {
5453         int i = 0;
5454         while (i < 6)
5455             PL_colors[i++] = (char *)"";
5456     }
5457     PL_colorset = 1;
5458 }
5459 #endif
5460
5461
/* CHECK_RESTUDY_GOTO_butfirst(dOsomething)
 *
 * When TRIE_STUDY_OPT is defined: if data.flags has SCF_TRIE_RESTUDY set
 * and this is the first time the check fires (restudied is still zero;
 * it is incremented as a side effect), execute dOsomething and then jump
 * to the reStudy label.  Expects `data`, `restudied` and the `reStudy`
 * label to be in scope at the point of use.
 *
 * Without TRIE_STUDY_OPT the macro must do nothing at all.  It is
 * deliberately function-like in that case too, so that the dOsomething
 * argument is discarded; an object-like empty macro would leave
 * "(dOsomething);" behind and execute it unconditionally.
 */
#ifdef TRIE_STUDY_OPT
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
    STMT_START {                                            \
        if (                                                \
              (data.flags & SCF_TRIE_RESTUDY)               \
              && ! restudied++                              \
        ) {                                                 \
            dOsomething;                                    \
            goto reStudy;                                   \
        }                                                   \
    } STMT_END
#else
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
    STMT_START { } STMT_END
#endif
5476
5477 /*
5478  * pregcomp - compile a regular expression into internal code
5479  *
5480  * Decides which engine's compiler to call based on the hint currently in
5481  * scope
5482  */
5483
5484 #ifndef PERL_IN_XSUB_RE
5485
5486 /* return the currently in-scope regex engine (or the default if none)  */
5487
5488 regexp_engine const *
5489 Perl_current_re_engine(pTHX)
5490 {
5491     dVAR;
5492
5493     if (IN_PERL_COMPILETIME) {
5494         HV * const table = GvHV(PL_hintgv);
5495         SV **ptr;
5496
5497         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5498             return &PL_core_reg_engine;
5499         ptr = hv_fetchs(table, "regcomp", FALSE);
5500         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5501             return &PL_core_reg_engine;
5502         return INT2PTR(regexp_engine*,SvIV(*ptr));
5503     }
5504     else {
5505         SV *ptr;
5506         if (!PL_curcop->cop_hints_hash)
5507             return &PL_core_reg_engine;
5508         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5509         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5510             return &PL_core_reg_engine;
5511         return INT2PTR(regexp_engine*,SvIV(ptr));
5512     }
5513 }
5514
5515
5516 REGEXP *
5517 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5518 {
5519     dVAR;
5520     regexp_engine const *eng = current_re_engine();
5521     GET_RE_DEBUG_FLAGS_DECL;
5522
5523     PERL_ARGS_ASSERT_PREGCOMP;
5524
5525     /* Dispatch a request to compile a regexp to correct regexp engine. */
5526     DEBUG_COMPILE_r({
5527         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5528                         PTR2UV(eng));
5529     });
5530     return CALLREGCOMP_ENG(eng, pattern, flags);
5531 }
5532 #endif
5533
5534 /* public(ish) entry point for the perl core's own regex compiling code.
5535  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5536  * pattern rather than a list of OPs, and uses the internal engine rather
5537  * than the current one */
5538
5539 REGEXP *
5540 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5541 {
5542     SV *pat = pattern; /* defeat constness! */
5543     PERL_ARGS_ASSERT_RE_COMPILE;
5544     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5545 #ifdef PERL_IN_XSUB_RE
5546                                 &my_reg_engine,
5547 #else
5548                                 &PL_core_reg_engine,
5549 #endif
5550                                 NULL, NULL, rx_flags, 0);
5551 }
5552
5553
5554 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5555  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5556  * point to the realloced string and length.
5557  *
5558  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5559  * stuff added */
5560
5561 static void
5562 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5563                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5564 {
5565     U8 *const src = (U8*)*pat_p;
5566     U8 *dst;
5567     int n=0;
5568     STRLEN s = 0, d = 0;
5569     bool do_end = 0;
5570     GET_RE_DEBUG_FLAGS_DECL;
5571
5572     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5573         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5574
5575     Newx(dst, *plen_p * 2 + 1, U8);
5576
5577     while (s < *plen_p) {
5578         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5579             dst[d]   = src[s];
5580         else {
5581             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5582             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5583         }
5584         if (n < num_code_blocks) {
5585             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5586                 pRExC_state->code_blocks[n].start = d;
5587                 assert(dst[d] == '(');
5588                 do_end = 1;
5589             }
5590             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5591                 pRExC_state->code_blocks[n].end = d;
5592                 assert(dst[d] == ')');
5593                 do_end = 0;
5594                 n++;
5595             }
5596         }
5597         s++;
5598         d++;
5599     }
5600     dst[d] = '\0';
5601     *plen_p = d;
5602     *pat_p = (char*) dst;
5603     SAVEFREEPV(*pat_p);
5604     RExC_orig_utf8 = RExC_utf8 = 1;
5605 }
5606
5607
5608
5609 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5610  * while recording any code block indices, and handling overloading,
5611  * nested qr// objects etc.  If pat is null, it will allocate a new
5612  * string, or just return the first arg, if there's only one.
5613  *
5614  * Returns the malloced/updated pat.
5615  * patternp and pat_count is the array of SVs to be concatted;
5616  * oplist is the optional list of ops that generated the SVs;
5617  * recompile_p is a pointer to a boolean that will be set if
5618  *   the regex will need to be recompiled.
5619  * delim, if non-null is an SV that will be inserted between each element
5620  */
5621
5622 static SV*
5623 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5624                 SV *pat, SV ** const patternp, int pat_count,
5625                 OP *oplist, bool *recompile_p, SV *delim)
5626 {
5627     SV **svp;
5628     int n = 0;
5629     bool use_delim = FALSE;
5630     bool alloced = FALSE;
5631
5632     /* if we know we have at least two args, create an empty string,
5633      * then concatenate args to that. For no args, return an empty string */
5634     if (!pat && pat_count != 1) {
5635         pat = newSVpvn("", 0);
5636         SAVEFREESV(pat);
5637         alloced = TRUE;
5638     }
5639
5640     for (svp = patternp; svp < patternp + pat_count; svp++) {
5641         SV *sv;
5642         SV *rx  = NULL;
5643         STRLEN orig_patlen = 0;
5644         bool code = 0;
5645         SV *msv = use_delim ? delim : *svp;
5646         if (!msv) msv = &PL_sv_undef;
5647
5648         /* if we've got a delimiter, we go round the loop twice for each
5649          * svp slot (except the last), using the delimiter the second
5650          * time round */
5651         if (use_delim) {
5652             svp--;
5653             use_delim = FALSE;
5654         }
5655         else if (delim)
5656             use_delim = TRUE;
5657
5658         if (SvTYPE(msv) == SVt_PVAV) {
5659             /* we've encountered an interpolated array within
5660              * the pattern, e.g. /...@a..../. Expand the list of elements,
5661              * then recursively append elements.
5662              * The code in this block is based on S_pushav() */
5663
5664             AV *const av = (AV*)msv;
5665             const SSize_t maxarg = AvFILL(av) + 1;
5666             SV **array;
5667
5668             if (oplist) {
5669                 assert(oplist->op_type == OP_PADAV
5670                     || oplist->op_type == OP_RV2AV);
5671                 oplist = oplist->op_sibling;;
5672             }
5673
5674             if (SvRMAGICAL(av)) {
5675                 SSize_t i;
5676
5677                 Newx(array, maxarg, SV*);
5678                 SAVEFREEPV(array);
5679                 for (i=0; i < maxarg; i++) {
5680                     SV ** const svp = av_fetch(av, i, FALSE);
5681                     array[i] = svp ? *svp : &PL_sv_undef;
5682                 }
5683             }
5684             else
5685                 array = AvARRAY(av);
5686
5687             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5688                                 array, maxarg, NULL, recompile_p,
5689                                 /* $" */
5690                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5691
5692             continue;
5693         }
5694
5695
5696         /* we make the assumption here that each op in the list of
5697          * op_siblings maps to one SV pushed onto the stack,
5698          * except for code blocks, with have both an OP_NULL and
5699          * and OP_CONST.
5700          * This allows us to match up the list of SVs against the
5701          * list of OPs to find the next code block.
5702          *
5703          * Note that       PUSHMARK PADSV PADSV ..
5704          * is optimised to
5705          *                 PADRANGE PADSV  PADSV  ..
5706          * so the alignment still works. */
5707
5708         if (oplist) {
5709             if (oplist->op_type == OP_NULL
5710                 && (oplist->op_flags & OPf_SPECIAL))
5711             {
5712                 assert(n < pRExC_state->num_code_blocks);
5713                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5714                 pRExC_state->code_blocks[n].block = oplist;
5715                 pRExC_state->code_blocks[n].src_regex = NULL;
5716                 n++;
5717                 code = 1;
5718                 oplist = oplist->op_sibling; /* skip CONST */
5719                 assert(oplist);
5720             }
5721             oplist = oplist->op_sibling;;
5722         }
5723
5724         /* apply magic and QR overloading to arg */
5725
5726         SvGETMAGIC(msv);
5727         if (SvROK(msv) && SvAMAGIC(msv)) {
5728             SV *sv = AMG_CALLunary(msv, regexp_amg);
5729             if (sv) {
5730                 if (SvROK(sv))
5731                     sv = SvRV(sv);
5732                 if (SvTYPE(sv) != SVt_REGEXP)
5733                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5734                 msv = sv;
5735             }
5736         }
5737
5738         /* try concatenation overload ... */
5739         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5740                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5741         {
5742             sv_setsv(pat, sv);
5743             /* overloading involved: all bets are off over literal
5744              * code. Pretend we haven't seen it */
5745             pRExC_state->num_code_blocks -= n;
5746             n = 0;
5747         }
5748         else  {
5749             /* ... or failing that, try "" overload */
5750             while (SvAMAGIC(msv)
5751                     && (sv = AMG_CALLunary(msv, string_amg))
5752                     && sv != msv
5753                     &&  !(   SvROK(msv)
5754                           && SvROK(sv)
5755                           && SvRV(msv) == SvRV(sv))
5756             ) {
5757                 msv = sv;
5758                 SvGETMAGIC(msv);
5759             }
5760             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5761                 msv = SvRV(msv);
5762
5763             if (pat) {
5764                 /* this is a partially unrolled
5765                  *     sv_catsv_nomg(pat, msv);
5766                  * that allows us to adjust code block indices if
5767                  * needed */
5768                 STRLEN dlen;
5769                 char *dst = SvPV_force_nomg(pat, dlen);
5770                 orig_patlen = dlen;
5771                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5772                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5773                     sv_setpvn(pat, dst, dlen);
5774                     SvUTF8_on(pat);
5775                 }
5776                 sv_catsv_nomg(pat, msv);
5777                 rx = msv;
5778             }
5779             else
5780                 pat = msv;
5781
5782             if (code)
5783                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5784         }
5785
5786         /* extract any code blocks within any embedded qr//'s */
5787         if (rx && SvTYPE(rx) == SVt_REGEXP
5788             && RX_ENGINE((REGEXP*)rx)->op_comp)
5789         {
5790
5791             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5792             if (ri->num_code_blocks) {
5793                 int i;
5794                 /* the presence of an embedded qr// with code means
5795                  * we should always recompile: the text of the
5796                  * qr// may not have changed, but it may be a
5797                  * different closure than last time */
5798                 *recompile_p = 1;
5799                 Renew(pRExC_state->code_blocks,
5800                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5801                     struct reg_code_block);
5802                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5803
5804                 for (i=0; i < ri->num_code_blocks; i++) {
5805                     struct reg_code_block *src, *dst;
5806                     STRLEN offset =  orig_patlen
5807                         + ReANY((REGEXP *)rx)->pre_prefix;
5808                     assert(n < pRExC_state->num_code_blocks);
5809                     src = &ri->code_blocks[i];
5810                     dst = &pRExC_state->code_blocks[n];
5811                     dst->start      = src->start + offset;
5812                     dst->end        = src->end   + offset;
5813                     dst->block      = src->block;
5814                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5815                                             src->src_regex
5816                                                 ? src->src_regex
5817                                                 : (REGEXP*)rx);
5818                     n++;
5819                 }
5820             }
5821         }
5822     }
5823     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5824     if (alloced)
5825         SvSETMAGIC(pat);
5826
5827     return pat;
5828 }
5829
5830
5831
5832 /* see if there are any run-time code blocks in the pattern.
5833  * False positives are allowed */
5834
5835 static bool
5836 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5837                     char *pat, STRLEN plen)
5838 {
5839     int n = 0;
5840     STRLEN s;
5841
5842     for (s = 0; s < plen; s++) {
5843         if (n < pRExC_state->num_code_blocks
5844             && s == pRExC_state->code_blocks[n].start)
5845         {
5846             s = pRExC_state->code_blocks[n].end;
5847             n++;
5848             continue;
5849         }
5850         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5851          * positives here */
5852         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5853             (pat[s+2] == '{'
5854                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5855         )
5856             return 1;
5857     }
5858     return 0;
5859 }
5860
5861 /* Handle run-time code blocks. We will already have compiled any direct
5862  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5863  * copy of it, but with any literal code blocks blanked out and
5864  * appropriate chars escaped; then feed it into
5865  *
5866  *    eval "qr'modified_pattern'"
5867  *
5868  * For example,
5869  *
5870  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5871  *
5872  * becomes
5873  *
5874  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5875  *
5876  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5877  * and merge them with any code blocks of the original regexp.
5878  *
5879  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5880  * instead, just save the qr and return FALSE; this tells our caller that
5881  * the original pattern needs upgrading to utf8.
5882  */
5883
5884 static bool
5885 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5886     char *pat, STRLEN plen)
5887 {
5888     SV *qr;
5889
5890     GET_RE_DEBUG_FLAGS_DECL;
5891
5892     if (pRExC_state->runtime_code_qr) {
5893         /* this is the second time we've been called; this should
5894          * only happen if the main pattern got upgraded to utf8
5895          * during compilation; re-use the qr we compiled first time
5896          * round (which should be utf8 too)
5897          */
5898         qr = pRExC_state->runtime_code_qr;
5899         pRExC_state->runtime_code_qr = NULL;
5900         assert(RExC_utf8 && SvUTF8(qr));
5901     }
5902     else {
5903         int n = 0;
5904         STRLEN s;
5905         char *p, *newpat;
5906         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5907         SV *sv, *qr_ref;
5908         dSP;
5909
5910         /* determine how many extra chars we need for ' and \ escaping */
5911         for (s = 0; s < plen; s++) {
5912             if (pat[s] == '\'' || pat[s] == '\\')
5913                 newlen++;
5914         }
5915
5916         Newx(newpat, newlen, char);
5917         p = newpat;
5918         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5919
5920         for (s = 0; s < plen; s++) {
5921             if (n < pRExC_state->num_code_blocks
5922                 && s == pRExC_state->code_blocks[n].start)
5923             {
5924                 /* blank out literal code block */
5925                 assert(pat[s] == '(');
5926                 while (s <= pRExC_state->code_blocks[n].end) {
5927                     *p++ = '_';
5928                     s++;
5929                 }
5930                 s--;
5931                 n++;
5932                 continue;
5933             }
5934             if (pat[s] == '\'' || pat[s] == '\\')
5935                 *p++ = '\\';
5936             *p++ = pat[s];
5937         }
5938         *p++ = '\'';
5939         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5940             *p++ = 'x';
5941         *p++ = '\0';
5942         DEBUG_COMPILE_r({
5943             PerlIO_printf(Perl_debug_log,
5944                 "%sre-parsing pattern for runtime code:%s %s\n",
5945                 PL_colors[4],PL_colors[5],newpat);
5946         });
5947
5948         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5949         Safefree(newpat);
5950
5951         ENTER;
5952         SAVETMPS;
5953         save_re_context();
5954         PUSHSTACKi(PERLSI_REQUIRE);
5955         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5956          * parsing qr''; normally only q'' does this. It also alters
5957          * hints handling */
5958         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5959         SvREFCNT_dec_NN(sv);
5960         SPAGAIN;
5961         qr_ref = POPs;
5962         PUTBACK;
5963         {
5964             SV * const errsv = ERRSV;
5965             if (SvTRUE_NN(errsv))
5966             {
5967                 Safefree(pRExC_state->code_blocks);
5968                 /* use croak_sv ? */
5969                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5970             }
5971         }
5972         assert(SvROK(qr_ref));
5973         qr = SvRV(qr_ref);
5974         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5975         /* the leaving below frees the tmp qr_ref.
5976          * Give qr a life of its own */
5977         SvREFCNT_inc(qr);
5978         POPSTACK;
5979         FREETMPS;
5980         LEAVE;
5981
5982     }
5983
5984     if (!RExC_utf8 && SvUTF8(qr)) {
5985         /* first time through; the pattern got upgraded; save the
5986          * qr for the next time through */
5987         assert(!pRExC_state->runtime_code_qr);
5988         pRExC_state->runtime_code_qr = qr;
5989         return 0;
5990     }
5991
5992
5993     /* extract any code blocks within the returned qr//  */
5994
5995
5996     /* merge the main (r1) and run-time (r2) code blocks into one */
5997     {
5998         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5999         struct reg_code_block *new_block, *dst;
6000         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6001         int i1 = 0, i2 = 0;
6002
6003         if (!r2->num_code_blocks) /* we guessed wrong */
6004         {
6005             SvREFCNT_dec_NN(qr);
6006             return 1;
6007         }
6008
6009         Newx(new_block,
6010             r1->num_code_blocks + r2->num_code_blocks,
6011             struct reg_code_block);
6012         dst = new_block;
6013
6014         while (    i1 < r1->num_code_blocks
6015                 || i2 < r2->num_code_blocks)
6016         {
6017             struct reg_code_block *src;
6018             bool is_qr = 0;
6019
6020             if (i1 == r1->num_code_blocks) {
6021                 src = &r2->code_blocks[i2++];
6022                 is_qr = 1;
6023             }
6024             else if (i2 == r2->num_code_blocks)
6025                 src = &r1->code_blocks[i1++];
6026             else if (  r1->code_blocks[i1].start
6027                      < r2->code_blocks[i2].start)
6028             {
6029                 src = &r1->code_blocks[i1++];
6030                 assert(src->end < r2->code_blocks[i2].start);
6031             }
6032             else {
6033                 assert(  r1->code_blocks[i1].start
6034                        > r2->code_blocks[i2].start);
6035                 src = &r2->code_blocks[i2++];
6036                 is_qr = 1;
6037                 assert(src->end < r1->code_blocks[i1].start);
6038             }
6039
6040             assert(pat[src->start] == '(');
6041             assert(pat[src->end]   == ')');
6042             dst->start      = src->start;
6043             dst->end        = src->end;
6044             dst->block      = src->block;
6045             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6046                                     : src->src_regex;
6047             dst++;
6048         }
6049         r1->num_code_blocks += r2->num_code_blocks;
6050         Safefree(r1->code_blocks);
6051         r1->code_blocks = new_block;
6052     }
6053
6054     SvREFCNT_dec_NN(qr);
6055     return 1;
6056 }
6057
6058
6059 STATIC bool
6060 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6061                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6062                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6063                       STRLEN longest_length, bool eol, bool meol)
6064 {
6065     /* This is the common code for setting up the floating and fixed length
6066      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6067      * as to whether succeeded or not.
          *
          * pRExC_state     compile state; not named directly in this body
          *                 (presumably reached through the RExC_flags and
          *                 RExC_seen macros -- confirm against their definitions)
          * sv_longest      the candidate substring SV.  On success it is stored
          *                 through exactly one of rx_utf8 / rx_substr, chosen by
          *                 its SvUTF8 flag, and the other slot is set to NULL.
          * rx_end_shift    on success receives
          *                   ml - offset - longest_length
          *                      + (SvTAIL(sv_longest) != 0) + lookbehind
          *                 where ml is *minlen, or longest_length if minlen is
          *                 NULL.
          * lookbehind, offset
          *                 terms of the end-shift calculation above.
          * minlen          may be NULL (see ml above).
          * longest_length  length of the candidate; when it is 0, setup only
          *                 proceeds if the eol/meol test below holds.
          * eol, meol       together with RXf_PMf_MULTILINE in RExC_flags they
          *                 decide whether fbm_compile() is given FBMcf_TAIL.
          *
          * On a FALSE return none of the output parameters has been written and
          * fbm_compile() has not been called. */
6068
6069     I32 t;
6070     SSize_t ml;
6071     /* Give up (FALSE) when both longest_length is 0 and the eol test fails,
          * or whenever REG_UNFOLDED_MULTI_SEEN is set in RExC_seen.  The eol
          * test is: eol set, and either meol clear or RXf_PMf_MULTILINE set. */
6072     if (! (longest_length
6073            || (eol /* Can't have SEOL and MULTI */
6074                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6075           )
6076             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6077         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6078     {
6079         return FALSE;
6080     }
6081
6082     /* copy the information about the longest from the reg_scan_data
6083         over to the program.  Only one of the two slots is populated,
             depending on whether sv_longest has its UTF-8 flag set; the
             other is explicitly cleared. */
6084     if (SvUTF8(sv_longest)) {
6085         *rx_utf8 = sv_longest;
6086         *rx_substr = NULL;
6087     } else {
6088         *rx_substr = sv_longest;
6089         *rx_utf8 = NULL;
6090     }
6091     /* end_shift is how many chars that must be matched that
6092         follow this item. We calculate it ahead of time as once the
6093         lookbehind offset is added in we lose the ability to correctly
6094         calculate it.  When the caller passes no minlen pointer, the
             length of the string itself stands in for the minimum length. */
6095     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6096     *rx_end_shift = ml - offset
6097         - longest_length + (SvTAIL(sv_longest) != 0)
6098         + lookbehind;
6099     /* NOTE(review): SvTAIL() above is read before the fbm_compile() call
          * below -- confirm whether the tail flag can already be set at that
          * point.  t repeats the eol test used in the guard at the top. */
6100     t = (eol/* Can't have SEOL and MULTI */
6101          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6102     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6103
6104     return TRUE;
6105 }
6106
6107 /*
6108  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6109  * regular expression into internal code.
6110  * The pattern may be passed either as:
6111  *    a list of SVs (patternp plus pat_count)
6112  *    a list of OPs (expr)
6113  * If both are passed, the SV list is used, but the OP list indicates
6114  * which SVs are actually pre-compiled code blocks
6115  *
6116  * The SVs in the list have magic and qr overloading applied to them (and
6117  * the list may be modified in-place with replacement SVs in the latter
6118  * case).
6119  *
6120  * If the pattern hasn't changed from old_re, then old_re will be
6121  * returned.
6122  *
6123  * eng is the current engine. If that engine has an op_comp method, then
6124  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6125  * do the initial concatenation of arguments and pass on to the external
6126  * engine.
6127  *
6128  * If is_bare_re is not null, set it to a boolean indicating whether the
6129  * arg list reduced (after overloading) to a single bare regex which has
6130  * been returned (i.e. /$qr/).
6131  *
6132  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6133  *
6134  * pm_flags contains the PMf_* flags, typically based on those from the
6135  * pm_flags field of the related PMOP. Currently we're only interested in
6136  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6137  *
6138  * We can't allocate space until we know how big the compiled form will be,
6139  * but we can't compile it (and thus know how big it is) until we've got a
6140  * place to put the code.  So we cheat:  we compile it twice, once with code
6141  * generation turned off and size counting turned on, and once "for real".
6142  * This also means that we don't allocate space until we are sure that the
6143  * thing really will compile successfully, and we never have to move the
6144  * code and thus invalidate pointers into it.  (Note that it has to be in
6145  * one piece because free() must be able to free it all.) [NB: not true in perl]
6146  *
6147  * Beware that the optimization-preparation code in here knows about some
6148  * of the structure of the compiled regexp.  [I'll say.]
6149  */
6150
6151 REGEXP *
6152 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6153                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6154                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6155 {
6156     dVAR;
6157     REGEXP *rx;
6158     struct regexp *r;
6159     regexp_internal *ri;
6160     STRLEN plen;
6161     char *exp;
6162     regnode *scan;
6163     I32 flags;
6164     SSize_t minlen = 0;
6165     U32 rx_flags;
6166     SV *pat;
6167     SV *code_blocksv = NULL;
6168     SV** new_patternp = patternp;
6169
6170     /* these are all flags - maybe they should be turned
6171      * into a single int with different bit masks */
6172     I32 sawlookahead = 0;
6173     I32 sawplus = 0;
6174     I32 sawopen = 0;
6175     I32 sawminmod = 0;
6176
6177     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6178     bool recompile = 0;
6179     bool runtime_code = 0;
6180     scan_data_t data;
6181     RExC_state_t RExC_state;
6182     RExC_state_t * const pRExC_state = &RExC_state;
6183 #ifdef TRIE_STUDY_OPT
6184     int restudied = 0;
6185     RExC_state_t copyRExC_state;
6186 #endif
6187     GET_RE_DEBUG_FLAGS_DECL;
6188
6189     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6190
6191     DEBUG_r(if (!PL_colorset) reginitcolors());
6192
6193 #ifndef PERL_IN_XSUB_RE
6194     /* Initialize these here instead of as-needed, as is quick and avoids
6195      * having to test them each time otherwise */
6196     if (! PL_AboveLatin1) {
6197         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6198         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6199         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6200         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6201         PL_HasMultiCharFold =
6202                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6203     }
6204 #endif
6205
6206     pRExC_state->code_blocks = NULL;
6207     pRExC_state->num_code_blocks = 0;
6208
6209     if (is_bare_re)
6210         *is_bare_re = FALSE;
6211
6212     if (expr && (expr->op_type == OP_LIST ||
6213                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6214         /* allocate code_blocks if needed */
6215         OP *o;
6216         int ncode = 0;
6217
6218         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6219             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6220                 ncode++; /* count of DO blocks */
6221         if (ncode) {
6222             pRExC_state->num_code_blocks = ncode;
6223             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6224         }
6225     }
6226
6227     if (!pat_count) {
6228         /* compile-time pattern with just OP_CONSTs and DO blocks */
6229
6230         int n;
6231         OP *o;
6232
6233         /* find how many CONSTs there are */
6234         assert(expr);
6235         n = 0;
6236         if (expr->op_type == OP_CONST)
6237             n = 1;
6238         else
6239             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6240                 if (o->op_type == OP_CONST)
6241                     n++;
6242             }
6243
6244         /* fake up an SV array */
6245
6246         assert(!new_patternp);
6247         Newx(new_patternp, n, SV*);
6248         SAVEFREEPV(new_patternp);
6249         pat_count = n;
6250
6251         n = 0;
6252         if (expr->op_type == OP_CONST)
6253             new_patternp[n] = cSVOPx_sv(expr);
6254         else
6255             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6256                 if (o->op_type == OP_CONST)
6257                     new_patternp[n++] = cSVOPo_sv;
6258             }
6259
6260     }
6261
6262     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6263         "Assembling pattern from %d elements%s\n", pat_count,
6264             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6265
6266     /* set expr to the first arg op */
6267
6268     if (pRExC_state->num_code_blocks
6269          && expr->op_type != OP_CONST)
6270     {
6271             expr = cLISTOPx(expr)->op_first;
6272             assert(   expr->op_type == OP_PUSHMARK
6273                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6274                    || expr->op_type == OP_PADRANGE);
6275             expr = expr->op_sibling;
6276     }
6277
6278     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6279                         expr, &recompile, NULL);
6280
6281     /* handle bare (possibly after overloading) regex: foo =~ $re */
6282     {
6283         SV *re = pat;
6284         if (SvROK(re))
6285             re = SvRV(re);
6286         if (SvTYPE(re) == SVt_REGEXP) {
6287             if (is_bare_re)
6288                 *is_bare_re = TRUE;
6289             SvREFCNT_inc(re);
6290             Safefree(pRExC_state->code_blocks);
6291             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6292                 "Precompiled pattern%s\n",
6293                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6294
6295             return (REGEXP*)re;
6296         }
6297     }
6298
6299     exp = SvPV_nomg(pat, plen);
6300
6301     if (!eng->op_comp) {
6302         if ((SvUTF8(pat) && IN_BYTES)
6303                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6304         {
6305             /* make a temporary copy; either to convert to bytes,
6306              * or to avoid repeating get-magic / overloaded stringify */
6307             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6308                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6309         }
6310         Safefree(pRExC_state->code_blocks);
6311         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6312     }
6313
6314     /* ignore the utf8ness if the pattern is 0 length */
6315     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6316     RExC_uni_semantics = 0;
6317     RExC_contains_locale = 0;
6318     RExC_contains_i = 0;
6319     pRExC_state->runtime_code_qr = NULL;
6320
6321     DEBUG_COMPILE_r({
6322             SV *dsv= sv_newmortal();
6323             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6324             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6325                           PL_colors[4],PL_colors[5],s);
6326         });
6327
6328   redo_first_pass:
6329     /* we jump here if we upgrade the pattern to utf8 and have to
6330      * recompile */
6331
6332     if ((pm_flags & PMf_USE_RE_EVAL)
6333                 /* this second condition covers the non-regex literal case,
6334                  * i.e.  $foo =~ '(?{})'. */
6335                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6336     )
6337         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6338
6339     /* return old regex if pattern hasn't changed */
6340     /* XXX: note in the below we have to check the flags as well as the
6341      * pattern.
6342      *
6343      * Things get a touch tricky as we have to compare the utf8 flag
6344      * independently from the compile flags.  */
6345
6346     if (   old_re
6347         && !recompile
6348         && !!RX_UTF8(old_re) == !!RExC_utf8
6349         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6350         && RX_PRECOMP(old_re)
6351         && RX_PRELEN(old_re) == plen
6352         && memEQ(RX_PRECOMP(old_re), exp, plen)
6353         && !runtime_code /* with runtime code, always recompile */ )
6354     {
6355         Safefree(pRExC_state->code_blocks);
6356         return old_re;
6357     }
6358
6359     rx_flags = orig_rx_flags;
6360
6361     if (rx_flags & PMf_FOLD) {
6362         RExC_contains_i = 1;
6363     }
6364     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6365
6366         /* Set to use unicode semantics if the pattern is in utf8 and has the
6367          * 'depends' charset specified, as it means unicode when utf8  */
6368         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6369     }
6370
6371     RExC_precomp = exp;
6372     RExC_flags = rx_flags;
6373     RExC_pm_flags = pm_flags;
6374
6375     if (runtime_code) {
6376         if (TAINTING_get && TAINT_get)
6377             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6378
6379         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6380             /* whoops, we have a non-utf8 pattern, whilst run-time code
6381              * got compiled as utf8. Try again with a utf8 pattern */
6382             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6383                                     pRExC_state->num_code_blocks);
6384             goto redo_first_pass;
6385         }
6386     }
6387     assert(!pRExC_state->runtime_code_qr);
6388
6389     RExC_sawback = 0;
6390
6391     RExC_seen = 0;
6392     RExC_maxlen = 0;
6393     RExC_in_lookbehind = 0;
6394     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6395     RExC_extralen = 0;
6396     RExC_override_recoding = 0;
6397     RExC_in_multi_char_class = 0;
6398
6399     /* First pass: determine size, legality. */
6400     RExC_parse = exp;
6401     RExC_start = exp;
6402     RExC_end = exp + plen;
6403     RExC_naughty = 0;
6404     RExC_npar = 1;
6405     RExC_nestroot = 0;
6406     RExC_size = 0L;
6407     RExC_emit = (regnode *) &RExC_emit_dummy;
6408     RExC_whilem_seen = 0;
6409     RExC_open_parens = NULL;
6410     RExC_close_parens = NULL;
6411     RExC_opend = NULL;
6412     RExC_paren_names = NULL;
6413 #ifdef DEBUGGING
6414     RExC_paren_name_list = NULL;
6415 #endif
6416     RExC_recurse = NULL;
6417     RExC_study_chunk_recursed = NULL;
6418     RExC_study_chunk_recursed_bytes= 0;
6419     RExC_recurse_count = 0;
6420     pRExC_state->code_index = 0;
6421
6422 #if 0 /* REGC() is (currently) a NOP at the first pass.
6423        * Clever compilers notice this and complain. --jhi */
6424     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6425 #endif
6426     DEBUG_PARSE_r(
6427         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6428         RExC_lastnum=0;
6429         RExC_lastparse=NULL;
6430     );
6431     /* reg may croak on us, not giving us a chance to free
6432        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6433        need it to survive as long as the regexp (qr/(?{})/).
6434        We must check that code_blocksv is not already set, because we may
6435        have jumped back to restart the sizing pass. */
6436     if (pRExC_state->code_blocks && !code_blocksv) {
6437         code_blocksv = newSV_type(SVt_PV);
6438         SAVEFREESV(code_blocksv);
6439         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6440         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6441     }
6442     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6443         /* It's possible to write a regexp in ascii that represents Unicode
6444         codepoints outside of the byte range, such as via \x{100}. If we
6445         detect such a sequence we have to convert the entire pattern to utf8
6446         and then recompile, as our sizing calculation will have been based
6447         on 1 byte == 1 character, but we will need to use utf8 to encode
6448         at least some part of the pattern, and therefore must convert the whole
6449         thing.
6450         -- dmq */
6451         if (flags & RESTART_UTF8) {
6452             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6453                                     pRExC_state->num_code_blocks);
6454             goto redo_first_pass;
6455         }
6456         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6457     }
6458     if (code_blocksv)
6459         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6460
6461     DEBUG_PARSE_r({
6462         PerlIO_printf(Perl_debug_log,
6463             "Required size %"IVdf" nodes\n"
6464             "Starting second pass (creation)\n",
6465             (IV)RExC_size);
6466         RExC_lastnum=0;
6467         RExC_lastparse=NULL;
6468     });
6469
6470     /* The first pass could have found things that force Unicode semantics */
6471     if ((RExC_utf8 || RExC_uni_semantics)
6472          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6473     {
6474         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6475     }
6476
6477     /* Small enough for pointer-storage convention?
6478        If extralen==0, this means that we will not need long jumps. */
6479     if (RExC_size >= 0x10000L && RExC_extralen)
6480         RExC_size += RExC_extralen;
6481     else
6482         RExC_extralen = 0;
6483     if (RExC_whilem_seen > 15)
6484         RExC_whilem_seen = 15;
6485
6486     /* Allocate space and zero-initialize. Note, the two step process
6487        of zeroing when in debug mode, thus anything assigned has to
6488        happen after that */
6489     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6490     r = ReANY(rx);
6491     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6492          char, regexp_internal);
6493     if ( r == NULL || ri == NULL )
6494         FAIL("Regexp out of space");
6495 #ifdef DEBUGGING
6496     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6497     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6498          char);
6499 #else
6500     /* bulk initialize base fields with 0. */
6501     Zero(ri, sizeof(regexp_internal), char);
6502 #endif
6503
6504     /* non-zero initialization begins here */
6505     RXi_SET( r, ri );
6506     r->engine= eng;
6507     r->extflags = rx_flags;
6508     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6509
6510     if (pm_flags & PMf_IS_QR) {
6511         ri->code_blocks = pRExC_state->code_blocks;
6512         ri->num_code_blocks = pRExC_state->num_code_blocks;
6513     }
6514     else
6515     {
6516         int n;
6517         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6518             if (pRExC_state->code_blocks[n].src_regex)
6519                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6520         SAVEFREEPV(pRExC_state->code_blocks);
6521     }
6522
6523     {
6524         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6525         bool has_charset = (get_regex_charset(r->extflags)
6526                                                     != REGEX_DEPENDS_CHARSET);
6527
6528         /* The caret is output if there are any defaults: if not all the STD
6529          * flags are set, or if no character set specifier is needed */
6530         bool has_default =
6531                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6532                     || ! has_charset);
6533         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6534                                                    == REG_RUN_ON_COMMENT_SEEN);
6535         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6536                             >> RXf_PMf_STD_PMMOD_SHIFT);
6537         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6538         char *p;
6539         /* Allocate for the worst case, which is all the std flags are turned
6540          * on.  If more precision is desired, we could do a population count of
6541          * the flags set.  This could be done with a small lookup table, or by
6542          * shifting, masking and adding, or even, when available, assembly
6543          * language for a machine-language population count.
6544          * We never output a minus, as all those are defaults, so are
6545          * covered by the caret */
6546         const STRLEN wraplen = plen + has_p + has_runon
6547             + has_default       /* If needs a caret */
6548
6549                 /* If needs a character set specifier */
6550             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6551             + (sizeof(STD_PAT_MODS) - 1)
6552             + (sizeof("(?:)") - 1);
6553
6554         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6555         r->xpv_len_u.xpvlenu_pv = p;
6556         if (RExC_utf8)
6557             SvFLAGS(rx) |= SVf_UTF8;
6558         *p++='('; *p++='?';
6559
6560         /* If a default, cover it using the caret */
6561         if (has_default) {
6562             *p++= DEFAULT_PAT_MOD;
6563         }
6564         if (has_charset) {
6565             STRLEN len;
6566             const char* const name = get_regex_charset_name(r->extflags, &len);
6567             Copy(name, p, len, char);
6568             p += len;
6569         }
6570         if (has_p)
6571             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6572         {
6573             char ch;
6574             while((ch = *fptr++)) {
6575                 if(reganch & 1)
6576                     *p++ = ch;
6577                 reganch >>= 1;
6578             }
6579         }
6580
6581         *p++ = ':';
6582         Copy(RExC_precomp, p, plen, char);
6583         assert ((RX_WRAPPED(rx) - p) < 16);
6584         r->pre_prefix = p - RX_WRAPPED(rx);
6585         p += plen;
6586         if (has_runon)
6587             *p++ = '\n';
6588         *p++ = ')';
6589         *p = 0;
6590         SvCUR_set(rx, p - RX_WRAPPED(rx));
6591     }
6592
6593     r->intflags = 0;
6594     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6595
6596     /* setup various meta data about recursion, this all requires
6597      * RExC_npar to be correctly set, and a bit later on we clear it */
6598     if (RExC_seen & REG_RECURSE_SEEN) {
6599         Newxz(RExC_open_parens, RExC_npar,regnode *);
6600         SAVEFREEPV(RExC_open_parens);
6601         Newxz(RExC_close_parens,RExC_npar,regnode *);
6602         SAVEFREEPV(RExC_close_parens);
6603     }
6604     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6605         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6606          * So its 1 if there are no parens. */
6607         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6608                                          ((RExC_npar & 0x07) != 0);
6609         Newx(RExC_study_chunk_recursed,
6610              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6611         SAVEFREEPV(RExC_study_chunk_recursed);
6612     }
6613
6614     /* Useful during FAIL. */
6615 #ifdef RE_TRACK_PATTERN_OFFSETS
6616     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6617     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6618                           "%s %"UVuf" bytes for offset annotations.\n",
6619                           ri->u.offsets ? "Got" : "Couldn't get",
6620                           (UV)((2*RExC_size+1) * sizeof(U32))));
6621 #endif
6622     SetProgLen(ri,RExC_size);
6623     RExC_rx_sv = rx;
6624     RExC_rx = r;
6625     RExC_rxi = ri;
6626
6627     /* Second pass: emit code. */
6628     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6629     RExC_pm_flags = pm_flags;
6630     RExC_parse = exp;
6631     RExC_end = exp + plen;
6632     RExC_naughty = 0;
6633     RExC_npar = 1;
6634     RExC_emit_start = ri->program;
6635     RExC_emit = ri->program;
6636     RExC_emit_bound = ri->program + RExC_size + 1;
6637     pRExC_state->code_index = 0;
6638
6639     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6640     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6641         ReREFCNT_dec(rx);
6642         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6643     }
6644     /* XXXX To minimize changes to RE engine we always allocate
6645        3-units-long substrs field. */
6646     Newx(r->substrs, 1, struct reg_substr_data);
6647     if (RExC_recurse_count) {
6648         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6649         SAVEFREEPV(RExC_recurse);
6650     }
6651
6652 reStudy:
6653     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6654     Zero(r->substrs, 1, struct reg_substr_data);
6655     if (RExC_study_chunk_recursed)
6656         Zero(RExC_study_chunk_recursed,
6657              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6658
6659 #ifdef TRIE_STUDY_OPT
6660     if (!restudied) {
6661         StructCopy(&zero_scan_data, &data, scan_data_t);
6662         copyRExC_state = RExC_state;
6663     } else {
6664         U32 seen=RExC_seen;
6665         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6666
6667         RExC_state = copyRExC_state;
6668         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6669             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6670         else
6671             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6672         StructCopy(&zero_scan_data, &data, scan_data_t);
6673     }
6674 #else
6675     StructCopy(&zero_scan_data, &data, scan_data_t);
6676 #endif
6677
6678     /* Dig out information for optimizations. */
6679     r->extflags = RExC_flags; /* was pm_op */
6680     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6681
6682     if (UTF)
6683         SvUTF8_on(rx);  /* Unicode in it? */
6684     ri->regstclass = NULL;
6685     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6686         r->intflags |= PREGf_NAUGHTY;
6687     scan = ri->program + 1;             /* First BRANCH. */
6688
6689     /* testing for BRANCH here tells us whether there is "must appear"
6690        data in the pattern. If there is then we can use it for optimisations */
6691     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6692                                                   */
6693         SSize_t fake;
6694         STRLEN longest_float_length, longest_fixed_length;
6695         regnode_ssc ch_class; /* pointed to by data */
6696         int stclass_flag;
6697         SSize_t last_close = 0; /* pointed to by data */
6698         regnode *first= scan;
6699         regnode *first_next= regnext(first);
6700         /*
6701          * Skip introductions and multiplicators >= 1
6702          * so that we can extract the 'meat' of the pattern that must
6703          * match in the large if() sequence following.
6704          * NOTE that EXACT is NOT covered here, as it is normally
6705          * picked up by the optimiser separately.
6706          *
6707          * This is unfortunate as the optimiser isnt handling lookahead
6708          * properly currently.
6709          *
6710          */
6711         while ((OP(first) == OPEN && (sawopen = 1)) ||
6712                /* An OR of *one* alternative - should not happen now. */
6713             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6714             /* for now we can't handle lookbehind IFMATCH*/
6715             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6716             (OP(first) == PLUS) ||
6717             (OP(first) == MINMOD) ||
6718                /* An {n,m} with n>0 */
6719             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6720             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6721         {
6722                 /*
6723                  * the only op that could be a regnode is PLUS, all the rest
6724                  * will be regnode_1 or regnode_2.
6725                  *
6726                  * (yves doesn't think this is true)
6727                  */
6728                 if (OP(first) == PLUS)
6729                     sawplus = 1;
6730                 else {
6731                     if (OP(first) == MINMOD)
6732                         sawminmod = 1;
6733                     first += regarglen[OP(first)];
6734                 }
6735                 first = NEXTOPER(first);
6736                 first_next= regnext(first);
6737         }
6738
6739         /* Starting-point info. */
6740       again:
6741         DEBUG_PEEP("first:",first,0);
6742         /* Ignore EXACT as we deal with it later. */
6743         if (PL_regkind[OP(first)] == EXACT) {
6744             if (OP(first) == EXACT)
6745                 NOOP;   /* Empty, get anchored substr later. */
6746             else
6747                 ri->regstclass = first;
6748         }
6749 #ifdef TRIE_STCLASS
6750         else if (PL_regkind[OP(first)] == TRIE &&
6751                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6752         {
6753             regnode *trie_op;
6754             /* this can happen only on restudy */
6755             if ( OP(first) == TRIE ) {
6756                 struct regnode_1 *trieop = (struct regnode_1 *)
6757                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6758                 StructCopy(first,trieop,struct regnode_1);
6759                 trie_op=(regnode *)trieop;
6760             } else {
6761                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6762                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6763                 StructCopy(first,trieop,struct regnode_charclass);
6764                 trie_op=(regnode *)trieop;
6765             }
6766             OP(trie_op)+=2;
6767             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6768             ri->regstclass = trie_op;
6769         }
6770 #endif
6771         else if (REGNODE_SIMPLE(OP(first)))
6772             ri->regstclass = first;
6773         else if (PL_regkind[OP(first)] == BOUND ||
6774                  PL_regkind[OP(first)] == NBOUND)
6775             ri->regstclass = first;
6776         else if (PL_regkind[OP(first)] == BOL) {
6777             r->intflags |= (OP(first) == MBOL
6778                            ? PREGf_ANCH_MBOL
6779                            : (OP(first) == SBOL
6780                               ? PREGf_ANCH_SBOL
6781                               : PREGf_ANCH_BOL));
6782             first = NEXTOPER(first);
6783             goto again;
6784         }
6785         else if (OP(first) == GPOS) {
6786             r->intflags |= PREGf_ANCH_GPOS;
6787             first = NEXTOPER(first);
6788             goto again;
6789         }
6790         else if ((!sawopen || !RExC_sawback) &&
6791             (OP(first) == STAR &&
6792             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6793             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6794         {
6795             /* turn .* into ^.* with an implied $*=1 */
6796             const int type =
6797                 (OP(NEXTOPER(first)) == REG_ANY)
6798                     ? PREGf_ANCH_MBOL
6799                     : PREGf_ANCH_SBOL;
6800             r->intflags |= (type | PREGf_IMPLICIT);
6801             first = NEXTOPER(first);
6802             goto again;
6803         }
6804         if (sawplus && !sawminmod && !sawlookahead
6805             && (!sawopen || !RExC_sawback)
6806             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6807             /* x+ must match at the 1st pos of run of x's */
6808             r->intflags |= PREGf_SKIP;
6809
6810         /* Scan is after the zeroth branch, first is atomic matcher. */
6811 #ifdef TRIE_STUDY_OPT
6812         DEBUG_PARSE_r(
6813             if (!restudied)
6814                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6815                               (IV)(first - scan + 1))
6816         );
6817 #else
6818         DEBUG_PARSE_r(
6819             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6820                 (IV)(first - scan + 1))
6821         );
6822 #endif
6823
6824
6825         /*
6826         * If there's something expensive in the r.e., find the
6827         * longest literal string that must appear and make it the
6828         * regmust.  Resolve ties in favor of later strings, since
6829         * the regstart check works with the beginning of the r.e.
6830         * and avoiding duplication strengthens checking.  Not a
6831         * strong reason, but sufficient in the absence of others.
6832         * [Now we resolve ties in favor of the earlier string if
6833         * it happens that c_offset_min has been invalidated, since the
6834         * earlier string may buy us something the later one won't.]
6835         */
6836
6837         data.longest_fixed = newSVpvs("");
6838         data.longest_float = newSVpvs("");
6839         data.last_found = newSVpvs("");
6840         data.longest = &(data.longest_fixed);
6841         ENTER_with_name("study_chunk");
6842         SAVEFREESV(data.longest_fixed);
6843         SAVEFREESV(data.longest_float);
6844         SAVEFREESV(data.last_found);
6845         first = scan;
6846         if (!ri->regstclass) {
6847             ssc_init(pRExC_state, &ch_class);
6848             data.start_class = &ch_class;
6849             stclass_flag = SCF_DO_STCLASS_AND;
6850         } else                          /* XXXX Check for BOUND? */
6851             stclass_flag = 0;
6852         data.last_closep = &last_close;
6853
6854         DEBUG_RExC_seen();
6855         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6856                              scan + RExC_size, /* Up to end */
6857             &data, -1, 0, NULL,
6858             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6859                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6860             0);
6861
6862
6863         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6864
6865
6866         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6867              && data.last_start_min == 0 && data.last_end > 0
6868              && !RExC_seen_zerolen
6869              && !(RExC_seen & REG_VERBARG_SEEN)
6870              && !(RExC_seen & REG_GPOS_SEEN)
6871         ){
6872             r->extflags |= RXf_CHECK_ALL;
6873         }
6874         scan_commit(pRExC_state, &data,&minlen,0);
6875
6876         longest_float_length = CHR_SVLEN(data.longest_float);
6877
6878         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6879                    && data.offset_fixed == data.offset_float_min
6880                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6881             && S_setup_longest (aTHX_ pRExC_state,
6882                                     data.longest_float,
6883                                     &(r->float_utf8),
6884                                     &(r->float_substr),
6885                                     &(r->float_end_shift),
6886                                     data.lookbehind_float,
6887                                     data.offset_float_min,
6888                                     data.minlen_float,
6889                                     longest_float_length,
6890                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6891                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6892         {
6893             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6894             r->float_max_offset = data.offset_float_max;
6895             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6896                 r->float_max_offset -= data.lookbehind_float;
6897             SvREFCNT_inc_simple_void_NN(data.longest_float);
6898         }
6899         else {
6900             r->float_substr = r->float_utf8 = NULL;
6901             longest_float_length = 0;
6902         }
6903
6904         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6905
6906         if (S_setup_longest (aTHX_ pRExC_state,
6907                                 data.longest_fixed,
6908                                 &(r->anchored_utf8),
6909                                 &(r->anchored_substr),
6910                                 &(r->anchored_end_shift),
6911                                 data.lookbehind_fixed,
6912                                 data.offset_fixed,
6913                                 data.minlen_fixed,
6914                                 longest_fixed_length,
6915                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6916                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6917         {
6918             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6919             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6920         }
6921         else {
6922             r->anchored_substr = r->anchored_utf8 = NULL;
6923             longest_fixed_length = 0;
6924         }
6925         LEAVE_with_name("study_chunk");
6926
6927         if (ri->regstclass
6928             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6929             ri->regstclass = NULL;
6930
6931         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6932             && stclass_flag
6933             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6934             && !ssc_is_anything(data.start_class))
6935         {
6936             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6937
6938             ssc_finalize(pRExC_state, data.start_class);
6939
6940             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6941             StructCopy(data.start_class,
6942                        (regnode_ssc*)RExC_rxi->data->data[n],
6943                        regnode_ssc);
6944             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6945             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6946             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6947                       regprop(r, sv, (regnode*)data.start_class, NULL);
6948                       PerlIO_printf(Perl_debug_log,
6949                                     "synthetic stclass \"%s\".\n",
6950                                     SvPVX_const(sv));});
6951             data.start_class = NULL;
6952         }
6953
6954         /* A temporary algorithm prefers floated substr to fixed one to dig
6955          * more info. */
6956         if (longest_fixed_length > longest_float_length) {
6957             r->substrs->check_ix = 0;
6958             r->check_end_shift = r->anchored_end_shift;
6959             r->check_substr = r->anchored_substr;
6960             r->check_utf8 = r->anchored_utf8;
6961             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6962             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6963                 r->intflags |= PREGf_NOSCAN;
6964         }
6965         else {
6966             r->substrs->check_ix = 1;
6967             r->check_end_shift = r->float_end_shift;
6968             r->check_substr = r->float_substr;
6969             r->check_utf8 = r->float_utf8;
6970             r->check_offset_min = r->float_min_offset;
6971             r->check_offset_max = r->float_max_offset;
6972         }
6973         if ((r->check_substr || r->check_utf8) ) {
6974             r->extflags |= RXf_USE_INTUIT;
6975             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6976                 r->extflags |= RXf_INTUIT_TAIL;
6977         }
6978         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6979
6980         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6981         if ( (STRLEN)minlen < longest_float_length )
6982             minlen= longest_float_length;
6983         if ( (STRLEN)minlen < longest_fixed_length )
6984             minlen= longest_fixed_length;
6985         */
6986     }
6987     else {
6988         /* Several toplevels. Best we can do is to set minlen. */
6989         SSize_t fake;
6990         regnode_ssc ch_class;
6991         SSize_t last_close = 0;
6992
6993         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6994
6995         scan = ri->program + 1;
6996         ssc_init(pRExC_state, &ch_class);
6997         data.start_class = &ch_class;
6998         data.last_closep = &last_close;
6999
7000         DEBUG_RExC_seen();
7001         minlen = study_chunk(pRExC_state,
7002             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7003             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7004                                                       ? SCF_TRIE_DOING_RESTUDY
7005                                                       : 0),
7006             0);
7007
7008         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7009
7010         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7011                 = r->float_substr = r->float_utf8 = NULL;
7012
7013         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7014             && ! ssc_is_anything(data.start_class))
7015         {
7016             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7017
7018             ssc_finalize(pRExC_state, data.start_class);
7019
7020             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7021             StructCopy(data.start_class,
7022                        (regnode_ssc*)RExC_rxi->data->data[n],
7023                        regnode_ssc);
7024             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7025             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7026             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7027                       regprop(r, sv, (regnode*)data.start_class, NULL);
7028                       PerlIO_printf(Perl_debug_log,
7029                                     "synthetic stclass \"%s\".\n",
7030                                     SvPVX_const(sv));});
7031             data.start_class = NULL;
7032         }
7033     }
7034
7035     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7036         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7037         r->maxlen = REG_INFTY;
7038     }
7039     else {
7040         r->maxlen = RExC_maxlen;
7041     }
7042
7043     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7044        the "real" pattern. */
7045     DEBUG_OPTIMISE_r({
7046         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7047                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7048     });
7049     r->minlenret = minlen;
7050     if (r->minlen < minlen)
7051         r->minlen = minlen;
7052
7053     if (RExC_seen & REG_GPOS_SEEN)
7054         r->intflags |= PREGf_GPOS_SEEN;
7055     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7056         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7057                                                 lookbehind */
7058     if (pRExC_state->num_code_blocks)
7059         r->extflags |= RXf_EVAL_SEEN;
7060     if (RExC_seen & REG_CANY_SEEN)
7061         r->intflags |= PREGf_CANY_SEEN;
7062     if (RExC_seen & REG_VERBARG_SEEN)
7063     {
7064         r->intflags |= PREGf_VERBARG_SEEN;
7065         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7066     }
7067     if (RExC_seen & REG_CUTGROUP_SEEN)
7068         r->intflags |= PREGf_CUTGROUP_SEEN;
7069     if (pm_flags & PMf_USE_RE_EVAL)
7070         r->intflags |= PREGf_USE_RE_EVAL;
7071     if (RExC_paren_names)
7072         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7073     else
7074         RXp_PAREN_NAMES(r) = NULL;
7075
7076     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7077      * so it can be used in pp.c */
7078     if (r->intflags & PREGf_ANCH)
7079         r->extflags |= RXf_IS_ANCHORED;
7080
7081
7082     {
7083         /* this is used to identify "special" patterns that might result
7084          * in Perl NOT calling the regex engine and instead doing the match "itself",
7085          * particularly special cases in split//. By having the regex compiler
7086          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7087          * we avoid weird issues with equivalent patterns resulting in different behavior,
7088          * AND we allow non Perl engines to get the same optimizations by setting the
7089          * flags appropriately - Yves */
7090         regnode *first = ri->program + 1;
7091         U8 fop = OP(first);
7092         regnode *next = NEXTOPER(first);
7093         U8 nop = OP(next);
7094
7095         if (PL_regkind[fop] == NOTHING && nop == END)
7096             r->extflags |= RXf_NULL;
7097         else if (PL_regkind[fop] == BOL && nop == END)
7098             r->extflags |= RXf_START_ONLY;
7099         else if (fop == PLUS
7100                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7101                  && OP(regnext(first)) == END)
7102             r->extflags |= RXf_WHITE;
7103         else if ( r->extflags & RXf_SPLIT
7104                   && fop == EXACT
7105                   && STR_LEN(first) == 1
7106                   && *(STRING(first)) == ' '
7107                   && OP(regnext(first)) == END )
7108             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7109
7110     }
7111
7112     if (RExC_contains_locale) {
7113         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7114     }
7115
7116 #ifdef DEBUGGING
7117     if (RExC_paren_names) {
7118         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7119         ri->data->data[ri->name_list_idx]
7120                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7121     } else
7122 #endif
7123         ri->name_list_idx = 0;
7124
7125     if (RExC_recurse_count) {
7126         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7127             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7128             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7129         }
7130     }
7131     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7132     /* assume we don't need to swap parens around before we match */
7133
7134     DEBUG_DUMP_r({
7135         DEBUG_RExC_seen();
7136         PerlIO_printf(Perl_debug_log,"Final program:\n");
7137         regdump(r);
7138     });
7139 #ifdef RE_TRACK_PATTERN_OFFSETS
7140     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7141         const STRLEN len = ri->u.offsets[0];
7142         STRLEN i;
7143         GET_RE_DEBUG_FLAGS_DECL;
7144         PerlIO_printf(Perl_debug_log,
7145                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7146         for (i = 1; i <= len; i++) {
7147             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7148                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7149                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7150             }
7151         PerlIO_printf(Perl_debug_log, "\n");
7152     });
7153 #endif
7154
7155 #ifdef USE_ITHREADS
7156     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7157      * by setting the regexp SV to read-only instead. If the
7158      * pattern's been recompiled, the USEDness should remain. */
7159     if (old_re && SvREADONLY(old_re))
7160         SvREADONLY_on(rx);
7161 #endif
7162     return rx;
7163 }
7164
7165
     /* Perl_reg_named_buff
      *
      * Single entry point for operations on the named capture buffers of rx.
      * The RXapif_* bits in flags are tested in a fixed order and the first
      * one that is set selects the operation:
      *   FETCH                   -> reg_named_buff_fetch(rx, key, flags)
      *   STORE / DELETE / CLEAR  -> Perl_croak_no_modify()
      *   EXISTS                  -> &PL_sv_yes or &PL_sv_no
      *   REGNAMES                -> reg_named_buff_all(rx, flags)
      *   SCALAR / REGNAMES_COUNT -> reg_named_buff_scalar(rx, flags)
      * Any other flags value is reported with a "panic" croak.  value is
      * never looked at.
      */
7166 SV*
7167 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7168                     const U32 flags)
7169 {
7170     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7171     PERL_UNUSED_ARG(value);
7172
7173     if (flags & RXapif_FETCH)
7174         return reg_named_buff_fetch(rx, key, flags);
7175
7176     if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7177         Perl_croak_no_modify();
7178         return NULL;
7179     }
7180
7181     if (flags & RXapif_EXISTS)
7182         return reg_named_buff_exists(rx, key, flags) ? &PL_sv_yes : &PL_sv_no;
7183
7184     if (flags & RXapif_REGNAMES)
7185         return reg_named_buff_all(rx, flags);
7186     if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT))
7187         return reg_named_buff_scalar(rx, flags);
7188
7189     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7190     return NULL;
7191 }
7192
     /* Perl_reg_named_buff_iter
      *
      * Step through the capture-group names of rx.  RXapif_FIRSTKEY in flags
      * forwards to reg_named_buff_firstkey(), otherwise RXapif_NEXTKEY
      * forwards to reg_named_buff_nextkey(); with neither bit set the call
      * croaks with a "panic" message.  lastkey is not used.
      */
7193 SV*
7194 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7195                          const U32 flags)
7196 {
7197     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7198     PERL_UNUSED_ARG(lastkey);
7199
7200     if (flags & RXapif_FIRSTKEY)
7201         return reg_named_buff_firstkey(rx, flags);
7202
7203     if (flags & RXapif_NEXTKEY)
7204         return reg_named_buff_nextkey(rx, flags);
7205
7206     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7207                (int)flags);
7208     return NULL;
7209 }
7210
     /* Perl_reg_named_buff_fetch
      *
      * Look up the capture group(s) called namesv in r.
      *
      * Without RXapif_ALL in flags: returns a new SV holding the contents of
      * the first group of that name whose start and end offsets are both set,
      * or NULL if there is none.
      * With RXapif_ALL: returns a new reference to an array with one element
      * per group of that name, a copy of undef standing in for every group
      * that did not match.
      * Returns NULL if the regexp has no names, or no group of that name.
      *
      * The result array is created only once the name has been found.  It
      * used to be created up front, which leaked it on every path that
      * returned NULL.
      */
7211 SV*
7212 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7213                           const U32 flags)
7214 {
7215     SV *ret;
7216     struct regexp *const rx = ReANY(r);
7217
7218     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7219
7220     if (rx && RXp_PAREN_NAMES(rx)) {
7221         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7222         if (he_str) {
7223             IV i;
7224             SV* sv_dat=HeVAL(he_str);
7225             I32 *nums=(I32*)SvPVX(sv_dat);
                 /* from here on every path either returns retarray (wrapped
                  * in a reference) or never allocated it */
7226             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
7227             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7228                 if ((I32)(rx->nparens) >= nums[i]
7229                     && rx->offs[nums[i]].start != -1
7230                     && rx->offs[nums[i]].end != -1)
7231                 {
7232                     ret = newSVpvs("");
7233                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7234                     if (!retarray)
7235                         return ret;
7236                 } else {
7237                     if (retarray)
7238                         ret = newSVsv(&PL_sv_undef);
7239                 }
7240                 if (retarray)
7241                     av_push(retarray, ret);
7242             }
7243             if (retarray)
7244                 return newRV_noinc(MUTABLE_SV(retarray));
7245         }
7246     }
7247     return NULL;
7248 }
7252
     /* Perl_reg_named_buff_exists
      *
      * Report whether key names a capture group of r.
      *
      * With RXapif_ALL in flags it is enough for the name to be present in
      * the regexp's name hash.  Without it, the name must also yield a value
      * when fetched with the same flags.  A regexp without any names always
      * gives FALSE.
      */
7253 bool
7254 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7255                            const U32 flags)
7256 {
7257     struct regexp *const rx = ReANY(r);
7258     SV *fetched;
7259
7260     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7261
7262     if (!rx || !RXp_PAREN_NAMES(rx))
7263         return FALSE;
7264
7265     if (flags & RXapif_ALL)
7266         return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7267
7268     /* Otherwise the name only counts if a fetch yields a value; the SV
7269      * handed back is released again as soon as that is known. */
7270     fetched = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7271     if (!fetched)
7272         return FALSE;
7273
7274     SvREFCNT_dec_NN(fetched);
7275     return TRUE;
7276 }
7277
     /* Perl_reg_named_buff_firstkey
      *
      * Begin an iteration over the capture-group names of r: reset the
      * iterator of the name hash and return whatever the NEXTKEY step yields
      * (called with RXapif_FIRSTKEY removed from flags).
      *
      * Returns NULL when the regexp has no named groups.  This used to be
      * spelled FALSE, which is a boolean constant and not a pointer value.
      */
7278 SV*
7279 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7280 {
7281     struct regexp *const rx = ReANY(r);
7282
7283     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7284
7285     if (!rx || !RXp_PAREN_NAMES(rx))
7286         return NULL;
7287
7288     /* Restart the hash iterator, then behave like any other step. */
7289     (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7290
7291     return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7292 }
7293
     /* Perl_reg_named_buff_nextkey
      *
      * Continue the iteration over the name hash of r and return the next
      * name, as a new SV, that qualifies:
      *  - with RXapif_ALL in flags every name qualifies;
      *  - otherwise at least one of the groups listed under the name must be
      *    no greater than rx->lastparen and have both offsets set.
      * Names that do not qualify are skipped.  Returns NULL when the hash is
      * exhausted or the regexp has no names.
      */
7294 SV*
7295 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7296 {
7297     struct regexp *const rx = ReANY(r);
7298     HV *names;
7299     HE *entry;
7300     GET_RE_DEBUG_FLAGS_DECL;
7301
7302     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7303
7304     if (!rx || !RXp_PAREN_NAMES(rx))
7305         return NULL;
7306     names = RXp_PAREN_NAMES(rx);
7307     while ( (entry = hv_iternext_flags(names, 0)) ) {
7308         SV * const sv_dat = HeVAL(entry);
7309         I32 * const nums = (I32*)SvPVX(sv_dat);
7310         IV parno = 0;
7311         IV i;
7312         for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7313             const I32 n = nums[i];
7314             if ((I32)(rx->lastparen) < n)
7315                 continue;
7316             if (rx->offs[n].start == -1 || rx->offs[n].end == -1)
7317                 continue;
7318             parno = n;
7319             break;
7320         }
7321         if (parno || (flags & RXapif_ALL))
7322             return newSVhek(HeKEY_hek(entry));
7323     }
7324     return NULL;
7325 }
7326
     /* Perl_reg_named_buff_scalar
      *
      * Return, as a new IV-holding SV, a count of capture-group names of r:
      *  - RXapif_ALL or RXapif_REGNAMES_COUNT: the number of keys in the name
      *    hash;
      *  - RXapif_ONE: the number of names produced by the "all names" call
      *    made with the same flags plus RXapif_REGNAMES.
      * Any other flags value croaks with a "panic" message.  For a regexp
      * without names the result is &PL_sv_undef.
      */
7327 SV*
7328 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7329 {
7330     struct regexp *const rx = ReANY(r);
7331
7332     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7333
7334     if (!rx || !RXp_PAREN_NAMES(rx))
7335         return &PL_sv_undef;
7336
7337     if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT))
7338         return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7339
7340     if (flags & RXapif_ONE) {
7341         SV * const names_ref
7342             = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7343         AV * const names = MUTABLE_AV(SvRV(names_ref));
7344         const SSize_t top_index = av_tindex(names);
7345
7346         SvREFCNT_dec_NN(names_ref);
7347         return newSViv(top_index + 1);
7348     }
7349
7350     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7351                (int)flags);
7352     return NULL;
7353 }
7354
     /* Perl_reg_named_buff_all
      *
      * Return a new reference to a new array of capture-group names of r.
      * With RXapif_ALL in flags every name in the name hash is included;
      * otherwise only names for which at least one listed group is no greater
      * than rx->lastparen and has both offsets set.  The array is empty when
      * the regexp has no names.
      */
7355 SV*
7356 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7357 {
7358     struct regexp *const rx = ReANY(r);
7359     AV * const names = newAV();
7360
7361     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7362
7363     if (rx && RXp_PAREN_NAMES(rx)) {
7364         HV * const groups = RXp_PAREN_NAMES(rx);
7365         HE *entry;
7366
7367         (void)hv_iterinit(groups);
7368         while ( (entry = hv_iternext_flags(groups, 0)) ) {
7369             SV * const sv_dat = HeVAL(entry);
7370             I32 * const nums = (I32*)SvPVX(sv_dat);
7371             IV parno = 0;
7372             IV i;
7373             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7374                 const I32 n = nums[i];
7375                 if ((I32)(rx->lastparen) < n)
7376                     continue;
7377                 if (rx->offs[n].start == -1 || rx->offs[n].end == -1)
7378                     continue;
7379                 parno = n;
7380                 break;
7381             }
7382             if (parno || (flags & RXapif_ALL))
7383                 av_push(names, newSVhek(HeKEY_hek(entry)));
7384         }
7385     }
7386
7387     return newRV_noinc(MUTABLE_SV(names));
7388 }
7389
     /* Perl_reg_numbered_buff_fetch
      *
      * Set sv to the text of capture buffer paren of the match recorded in r,
      * or to undef if that buffer is not available.
      *
      * paren is either a group number (0 is the whole match) or one of the
      * RX_BUFF_IDX_* pseudo-indexes for the pre-, full- and post-match
      * strings.  The CARET_ variants are only honoured when KEEPCOPY (/p) is
      * set on the regexp or on PL_curpm; otherwise sv is set to undef.
      *
      * Besides the string value, the UTF-8 flag of sv is switched on or off
      * and, when tainting is enabled, its taintedness is updated from the
      * match.
      */
7390 void
7391 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7392                              SV * const sv)
7393 {
7394     struct regexp *const rx = ReANY(r);
7395     char *s = NULL;     /* start of the text to copy */
7396     SSize_t i = 0;      /* its length in bytes */
7397     SSize_t s1, t1;     /* start/end offsets of a numbered group */
7398     I32 n = paren;      /* working copy; may be remapped below */
7399
7400     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7401
7402     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7403            || n == RX_BUFF_IDX_CARET_FULLMATCH
7404            || n == RX_BUFF_IDX_CARET_POSTMATCH
7405        )
7406     {
7407         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7408         if (!keepcopy) {
7409             /* on something like
7410              *    $r = qr/.../;
7411              *    /$qr/p;
7412              * the KEEPCOPY is set on the PMOP rather than the regex */
7413             if (PL_curpm && r == PM_GETRE(PL_curpm))
7414                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7415         }
7416         if (!keepcopy)
7417             goto ret_undef;
7418     }
7419
         /* without a subbeg buffer there is nothing to extract from */
7420     if (!rx->subbeg)
7421         goto ret_undef;
7422
7423     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7424         /* no need to distinguish between them any more */
7425         n = RX_BUFF_IDX_FULLMATCH;
7426
7427     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7428         && rx->offs[0].start != -1)
7429     {
7430         /* $`, ${^PREMATCH} */
7431         i = rx->offs[0].start;
7432         s = rx->subbeg;
7433     }
7434     else
7435     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7436         && rx->offs[0].end != -1)
7437     {
7438         /* $', ${^POSTMATCH} */
             /* offsets in offs[] are adjusted by suboffset before being
              * applied to subbeg; the length runs to the end of the buffer */
7439         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7440         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7441     }
7442     else
7443     if ( 0 <= n && n <= (I32)rx->nparens &&
7444         (s1 = rx->offs[n].start) != -1 &&
7445         (t1 = rx->offs[n].end) != -1)
7446     {
7447         /* $&, ${^MATCH},  $1 ... */
7448         i = t1 - s1;
7449         s = rx->subbeg + s1 - rx->suboffset;
7450     } else {
             /* out-of-range index, or the buffer took no part in the match */
7451         goto ret_undef;
7452     }
7453
         /* the selected span has to lie inside subbeg[0 .. sublen] */
7454     assert(s >= rx->subbeg);
7455     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7456     if (i >= 0) {
7457 #ifdef NO_TAINT_SUPPORT
7458         sv_setpvn(sv, s, i);
7459 #else
             /* do the copy with taint switched off (TAINT_NOT), then restore
              * the previous taint state */
7460         const int oldtainted = TAINT_get;
7461         TAINT_NOT;
7462         sv_setpvn(sv, s, i);
7463         TAINT_set(oldtainted);
7464 #endif
             /* The result is flagged UTF-8 when the match was.  If
              * PREGf_CANY_SEEN is set, the extracted bytes must in addition
              * be empty or valid UTF-8 themselves. */
7465         if ( (rx->intflags & PREGf_CANY_SEEN)
7466             ? (RXp_MATCH_UTF8(rx)
7467                         && (!i || is_utf8_string((U8*)s, i)))
7468             : (RXp_MATCH_UTF8(rx)) )
7469         {
7470             SvUTF8_on(sv);
7471         }
7472         else
7473             SvUTF8_off(sv);
7474         if (TAINTING_get) {
7475             if (RXp_MATCH_TAINTED(rx)) {
7476                 if (SvTYPE(sv) >= SVt_PVMG) {
                         /* Detach the first entry of sv's magic chain, let
                          * SvTAINT() act on what is left, then put the
                          * detached entry back at the head of the resulting
                          * chain. */
7477                     MAGIC* const mg = SvMAGIC(sv);
7478                     MAGIC* mgt;
7479                     TAINT;
7480                     SvMAGIC_set(sv, mg->mg_moremagic);
7481                     SvTAINT(sv);
7482                     if ((mgt = SvMAGIC(sv))) {
7483                         mg->mg_moremagic = mgt;
7484                         SvMAGIC_set(sv, mg);
7485                     }
7486                 } else {
7487                     TAINT;
7488                     SvTAINT(sv);
7489                 }
7490             } else
7491                 SvTAINTED_off(sv);
7492         }
7493     } else {
           /* also the common exit for every "buffer not available" case */
7494       ret_undef:
7495         sv_setsv(sv,&PL_sv_undef);
7496         return;
7497     }
7498 }
7499
     /* Perl_reg_numbered_buff_store
      *
      * Store hook for the numbered capture buffers.  Nothing is ever stored
      * and all three arguments are ignored: the call returns silently while
      * PL_localizing is set (presumably while a local() is being processed --
      * confirm against the callers) and otherwise ends in
      * Perl_croak_no_modify().
      */
7500 void
7501 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7502                              SV const * const value)
7503 {
7504     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7505     PERL_UNUSED_ARG(rx);
7506     PERL_UNUSED_ARG(paren);
7507     PERL_UNUSED_ARG(value);
7508
7509     if (PL_localizing)
7510         return;
7511     Perl_croak_no_modify();
7512 }
7513
     /* Perl_reg_numbered_buff_length
      *
      * Return the length of capture buffer paren of the match recorded in r:
      * in characters when the match was UTF-8 and the bytes are valid UTF-8,
      * otherwise in bytes.  paren is a group number (0 is the whole match) or
      * one of the RX_BUFF_IDX_* pseudo-indexes.
      *
      * Returns 0 for an empty or unset pre/post-match.  For a group that is
      * out of range or did not take part in the match, and for a CARET_
      * pseudo-index without KEEPCOPY (/p), returns 0 after reporting an
      * uninitialized value on sv if that warning is enabled.
      *
      * Two points are handled the same way as in
      * Perl_reg_numbered_buff_fetch:
      *  - RX_BUFF_IDX_CARET_FULLMATCH is mapped to group 0 before offs[] is
      *    indexed, and negative indexes never reach offs[];
      *  - the post-match extends to sublen + suboffset, because offsets are
      *    taken relative to subbeg - suboffset.
      */
7514 I32
7515 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7516                               const I32 paren)
7517 {
7518     struct regexp *const rx = ReANY(r);
7519     I32 i;
7520     I32 s1, t1;
7521     I32 n = paren;
7522
7523     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7524
7525     if (   n == RX_BUFF_IDX_CARET_PREMATCH
7526         || n == RX_BUFF_IDX_CARET_FULLMATCH
7527         || n == RX_BUFF_IDX_CARET_POSTMATCH
7528     )
7529     {
7530         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7531         if (!keepcopy) {
7532             /* on something like
7533              *    $r = qr/.../;
7534              *    /$qr/p;
7535              * the KEEPCOPY is set on the PMOP rather than the regex */
7536             if (PL_curpm && r == PM_GETRE(PL_curpm))
7537                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7538         }
7539         if (!keepcopy)
7540             goto warn_undef;
7541     }
7542
         /* ${^MATCH} is group 0, just like $&; the pseudo-index itself must
          * never be used to index offs[] */
7543     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7544         n = RX_BUFF_IDX_FULLMATCH;
7545
7546     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7547     switch (n) {
7548       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7549       case RX_BUFF_IDX_PREMATCH:       /* $` */
7550         if (rx->offs[0].start != -1) {
7551             i = rx->offs[0].start;
7552             if (i > 0) {
7553                 s1 = 0;
7554                 t1 = i;
7555                 goto getlen;
7556             }
7557         }
7558         return 0;
7559
7560       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7561       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7562         if (rx->offs[0].end != -1) {
                 /* the buffer covers offsets suboffset .. suboffset+sublen */
7563             s1 = rx->offs[0].end;
7564             t1 = rx->sublen + rx->suboffset;
7565             i = t1 - s1;
7566             if (i > 0)
7567                 goto getlen;
7568         }
7569         return 0;
7570
7571       default: /* $& / ${^MATCH}, $1, $2, ... */
7572         if (0 <= n && n <= (I32)rx->nparens &&
7573             (s1 = rx->offs[n].start) != -1 &&
7574             (t1 = rx->offs[n].end) != -1)
7575         {
7576             i = t1 - s1;
7577             goto getlen;
7578         }
           /* also entered by goto from the KEEPCOPY check above */
7579       warn_undef:
7580         if (ckWARN(WARN_UNINITIALIZED))
7581             report_uninit((const SV *)sv);
7582         return 0;
7583     }
7584   getlen:
         /* i == t1 - s1 is the byte length here; turn it into a character
          * count when the match was UTF-8 and the bytes really are UTF-8 */
7585     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7586         const char * const s = rx->subbeg - rx->suboffset + s1;
7587         const U8 *ep;
7588         STRLEN el;
7589         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7590             i = el;
7591     }
7592     return i;
7593 }
7594
     /* Perl_reg_qr_package
      *
      * Return a new SV containing the string "Regexp".  The regexp argument
      * is not consulted.
      *
      * The former "if (0) return NULL; else" wrapper could never take its
      * first branch and has been dropped.
      */
7595 SV*
7596 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7597 {
7598     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7599     PERL_UNUSED_ARG(rx);
7600
7601     return newSVpvs("Regexp");
7602 }
7605
7606 /* Scans the name of a named buffer from the pattern, advancing RExC_parse
7607  * past it.  What is returned depends on flags:
7608  *   REG_RSN_RETURN_NULL  NULL; the name is only skipped over.
7609  *   REG_RSN_RETURN_NAME  a mortal SV* containing the name.
7610  *   REG_RSN_RETURN_DATA  the data SV* stored under the parsed name in the
7611  *                        RExC_paren_names hash.
      * If the parse position is already at the end of the pattern nothing is
      * consumed and the name is empty.
      * A name that does not start with an identifier-start character, or (for
      * REG_RSN_RETURN_DATA) a name that is not in the hash, throws a vFAIL()
      * type exception.  Any other non-zero flags value is a panic.
7612  */
7613
7614 #define REG_RSN_RETURN_NULL    0
7615 #define REG_RSN_RETURN_NAME    1
7616 #define REG_RSN_RETURN_DATA    2
7617
7618 STATIC SV*
7619 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7620 {
         /* remember where the name begins so it can be copied afterwards */
7621     char *name_start = RExC_parse;
7622
7623     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7624
7625     assert (RExC_parse <= RExC_end);
         /* at the end of the pattern: leave the position alone (empty name) */
7626     if (RExC_parse == RExC_end) NOOP;
7627     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7628          /* skip IDFIRST by using do...while */
7629         if (UTF)
                 /* step over one UTF-8 encoded character at a time */
7630             do {
7631                 RExC_parse += UTF8SKIP(RExC_parse);
7632             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7633         else
                 /* NOTE(review): the loop test reads *RExC_parse without
                  * comparing against RExC_end; presumably the pattern buffer
                  * ends in a non-word byte such as NUL -- confirm */
7634             do {
7635                 RExC_parse++;
7636             } while (isWORDCHAR(*RExC_parse));
7637     } else {
7638         RExC_parse++; /* so the <- from the vFAIL is after the offending
7639                          character */
7640         vFAIL("Group name must start with a non-digit word character");
7641     }
         /* REG_RSN_RETURN_NULL is 0, so it falls through to the final
          * "return NULL" without an SV being built */
7642     if ( flags ) {
             /* temporary (SVs_TEMP) copy of the name, flagged UTF-8 when the
              * pattern is */
7643         SV* sv_name
7644             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7645                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7646         if ( flags == REG_RSN_RETURN_NAME)
7647             return sv_name;
7648         else if (flags==REG_RSN_RETURN_DATA) {
7649             HE *he_str = NULL;
7650             SV *sv_dat = NULL;
7651             if ( ! sv_name )      /* should not happen*/
7652                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
                 /* a missing hash and a missing entry are treated alike:
                  * sv_dat stays NULL and the lookup fails below */
7653             if (RExC_paren_names)
7654                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7655             if ( he_str )
7656                 sv_dat = HeVAL(he_str);
7657             if ( ! sv_dat )
7658                 vFAIL("Reference to nonexistent named group");
7659             return sv_dat;
7660         }
7661         else {
7662             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7663                        (unsigned long) flags);
7664         }
7665         assert(0); /* NOT REACHED */
7666     }
7667     return NULL;
7668 }
7669
/* Emits the leading columns of a parse-trace line for <funcname>:
 *  - a preview of at most 10 bytes of the not-yet-parsed pattern, or blanks if
 *    the parse position is the same as on the previous trace line;
 *  - the current node number, or blanks if it is unchanged;
 *  - <funcname>, indented by twice the 'depth' variable of the calling scope.
 * The parse position and node number are then remembered for the next call.
 * The body is passed as a single macro argument, so it must not contain any
 * comma outside of parentheses. */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({                   \
    int remaining = (int)(RExC_end - RExC_parse);                       \
    int truncated = 0;                                                  \
    int padding;                                                        \
    int node_num;                                                       \
                                                                        \
    if (remaining > 10) {                                               \
        remaining = 10;                                                 \
        truncated = 1;                                                  \
    }                                                                   \
    padding = 10 - remaining;                                           \
                                                                        \
    if (RExC_lastparse == RExC_parse) {                                 \
        PerlIO_printf(Perl_debug_log, "%16s", "");                      \
    }                                                                   \
    else {                                                              \
        PerlIO_printf(Perl_debug_log, " >%.*s%-*s",                     \
                      remaining, RExC_parse,                            \
                      padding + 4,                                      \
                      truncated ? "..." : "<");                         \
    }                                                                   \
                                                                        \
    if (SIZE_ONLY) {                                                    \
        node_num = RExC_size + 1;                                       \
    }                                                                   \
    else {                                                              \
        node_num = REG_NODE_NUM(RExC_emit);                             \
    }                                                                   \
                                                                        \
    if (RExC_lastnum == node_num) {                                     \
        PerlIO_printf(Perl_debug_log, "|%4s", "");                      \
    }                                                                   \
    else {                                                              \
        PerlIO_printf(Perl_debug_log, "|%4d", node_num);                \
    }                                                                   \
                                                                        \
    PerlIO_printf(Perl_debug_log, "|%*s%-4s",                           \
                  (int)((depth*2)), "",                                 \
                  (funcname));                                          \
                                                                        \
    RExC_lastnum = node_num;                                            \
    RExC_lastparse = RExC_parse;                                        \
})
7704
7705
7706
/* Trace line made up of the DEBUG_PARSE_MSG() columns followed by a newline
 * (right-justified in a 4-wide field) */
#define DEBUG_PARSE(funcname)                                       \
    DEBUG_PARSE_r({                                                 \
        DEBUG_PARSE_MSG((funcname));                                \
        PerlIO_printf(Perl_debug_log, "%4s", "\n");                 \
    })

/* Trace line made up of the DEBUG_PARSE_MSG() columns followed by <args>
 * formatted according to <fmt> and then a newline.  <fmt> must be a string
 * literal, as the newline is pasted onto it; <args> is one macro argument. */
#define DEBUG_PARSE_FMT(funcname,fmt,args)                          \
    DEBUG_PARSE_r({                                                 \
        DEBUG_PARSE_MSG((funcname));                                \
        PerlIO_printf(Perl_debug_log, fmt "\n", args);              \
    })
7715
/* This section of code defines the inversion list object and its methods.  The
 * interfaces are highly subject to change, so as much as possible is static to
 * this file.  An inversion list is here implemented as a malloc'd C array of
 * UVs held by an SVt_INVLIST scalar.
7720  *
7721  * An inversion list for Unicode is an array of code points, sorted by ordinal
7722  * number.  The zeroth element is the first code point in the list.  The 1th
7723  * element is the first element beyond that not in the list.  In other words,
7724  * the first range is
7725  *  invlist[0]..(invlist[1]-1)
7726  * The other ranges follow.  Thus every element whose index is divisible by two
7727  * marks the beginning of a range that is in the list, and every element not
7728  * divisible by two marks the beginning of a range not in the list.  A single
7729  * element inversion list that contains the single code point N generally
7730  * consists of two elements
7731  *  invlist[0] == N
7732  *  invlist[1] == N+1
7733  * (The exception is when N is the highest representable value on the
7734  * machine, in which case the list containing just it would be a single
7735  * element, itself.  By extension, if the last range in the list extends to
7736  * infinity, then the first element of that range will be in the inversion list
7737  * at a position that is divisible by two, and is the final element in the
7738  * list.)
 * Taking the complement of (inverting) an inversion list is quite simple: if
 * the first element is 0, remove it; otherwise add a 0 element at the
 * beginning.  This implementation reserves an element at the beginning of each
 * inversion list to always contain 0; there is an additional flag in the
 * header which indicates whether the list begins at that 0, or is offset to
 * begin at the next element.
7745  *
7746  * More about inversion lists can be found in "Unicode Demystified"
7747  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7748  * More will be coming when functionality is added later.
7749  *
7750  * The inversion list data structure is currently implemented as an SV pointing
7751  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7752  * array of UV whose memory management is automatically handled by the existing
7753  * facilities for SV's.
7754  *
7755  * Some of the methods should always be private to the implementation, and some
7756  * should eventually be made public */
7757
7758 /* The header definitions are in F<inline_invlist.c> */
7759
7760 PERL_STATIC_INLINE UV*
7761 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7762 {
7763     /* Returns a pointer to the first element in the inversion list's array.
7764      * This is called upon initialization of an inversion list.  Where the
7765      * array begins depends on whether the list has the code point U+0000 in it
7766      * or not.  The other parameter tells it whether the code that follows this
7767      * call is about to put a 0 in the inversion list or not.  The first
7768      * element is either the element reserved for 0, if TRUE, or the element
7769      * after it, if FALSE */
7770
7771     bool* offset = get_invlist_offset_addr(invlist);
7772     UV* zero_addr = (UV *) SvPVX(invlist);
7773
7774     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7775
7776     /* Must be empty */
7777     assert(! _invlist_len(invlist));
7778
7779     *zero_addr = 0;
7780
7781     /* 1^1 = 0; 1^0 = 1 */
7782     *offset = 1 ^ will_have_0;
7783     return zero_addr + *offset;
7784 }
7785
7786 PERL_STATIC_INLINE UV*
7787 S_invlist_array(pTHX_ SV* const invlist)
7788 {
7789     /* Returns the pointer to the inversion list's array.  Every time the
7790      * length changes, this needs to be called in case malloc or realloc moved
7791      * it */
7792
7793     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7794
7795     /* Must not be empty.  If these fail, you probably didn't check for <len>
7796      * being non-zero before trying to get the array */
7797     assert(_invlist_len(invlist));
7798
7799     /* The very first element always contains zero, The array begins either
7800      * there, or if the inversion list is offset, at the element after it.
7801      * The offset header field determines which; it contains 0 or 1 to indicate
7802      * how much additionally to add */
7803     assert(0 == *(SvPVX(invlist)));
7804     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7805 }
7806
7807 PERL_STATIC_INLINE void
7808 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7809 {
7810     /* Sets the current number of elements stored in the inversion list.
7811      * Updates SvCUR correspondingly */
7812
7813     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7814
7815     assert(SvTYPE(invlist) == SVt_INVLIST);
7816
7817     SvCUR_set(invlist,
7818               (len == 0)
7819                ? 0
7820                : TO_INTERNAL_SIZE(len + offset));
7821     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7822 }
7823
7824 PERL_STATIC_INLINE IV*
7825 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7826 {
7827     /* Return the address of the IV that is reserved to hold the cached index
7828      * */
7829
7830     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7831
7832     assert(SvTYPE(invlist) == SVt_INVLIST);
7833
7834     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7835 }
7836
7837 PERL_STATIC_INLINE IV
7838 S_invlist_previous_index(pTHX_ SV* const invlist)
7839 {
7840     /* Returns cached index of previous search */
7841
7842     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7843
7844     return *get_invlist_previous_index_addr(invlist);
7845 }
7846
7847 PERL_STATIC_INLINE void
7848 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7849 {
7850     /* Caches <index> for later retrieval */
7851
7852     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7853
7854     assert(index == 0 || index < (int) _invlist_len(invlist));
7855
7856     *get_invlist_previous_index_addr(invlist) = index;
7857 }
7858
7859 PERL_STATIC_INLINE UV
7860 S_invlist_max(pTHX_ SV* const invlist)
7861 {
7862     /* Returns the maximum number of elements storable in the inversion list's
7863      * array, without having to realloc() */
7864
7865     PERL_ARGS_ASSERT_INVLIST_MAX;
7866
7867     assert(SvTYPE(invlist) == SVt_INVLIST);
7868
7869     /* Assumes worst case, in which the 0 element is not counted in the
7870      * inversion list, so subtracts 1 for that */
7871     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7872            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7873            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7874 }
7875
7876 #ifndef PERL_IN_XSUB_RE
7877 SV*
7878 Perl__new_invlist(pTHX_ IV initial_size)
7879 {
7880
7881     /* Return a pointer to a newly constructed inversion list, with enough
7882      * space to store 'initial_size' elements.  If that number is negative, a
7883      * system default is used instead */
7884
7885     SV* new_list;
7886
7887     if (initial_size < 0) {
7888         initial_size = 10;
7889     }
7890
7891     /* Allocate the initial space */
7892     new_list = newSV_type(SVt_INVLIST);
7893
7894     /* First 1 is in case the zero element isn't in the list; second 1 is for
7895      * trailing NUL */
7896     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7897     invlist_set_len(new_list, 0, 0);
7898
7899     /* Force iterinit() to be used to get iteration to work */
7900     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7901
7902     *get_invlist_previous_index_addr(new_list) = 0;
7903
7904     return new_list;
7905 }
7906
7907 SV*
7908 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7909 {
7910     /* Return a pointer to a newly constructed inversion list, initialized to
7911      * point to <list>, which has to be in the exact correct inversion list
7912      * form, including internal fields.  Thus this is a dangerous routine that
7913      * should not be used in the wrong hands.  The passed in 'list' contains
7914      * several header fields at the beginning that are not part of the
7915      * inversion list body proper */
7916
7917     const STRLEN length = (STRLEN) list[0];
7918     const UV version_id =          list[1];
7919     const bool offset   =    cBOOL(list[2]);
7920 #define HEADER_LENGTH 3
7921     /* If any of the above changes in any way, you must change HEADER_LENGTH
7922      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7923      *      perl -E 'say int(rand 2**31-1)'
7924      */
7925 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7926                                         data structure type, so that one being
7927                                         passed in can be validated to be an
7928                                         inversion list of the correct vintage.
7929                                        */
7930
7931     SV* invlist = newSV_type(SVt_INVLIST);
7932
7933     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7934
7935     if (version_id != INVLIST_VERSION_ID) {
7936         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7937     }
7938
7939     /* The generated array passed in includes header elements that aren't part
7940      * of the list proper, so start it just after them */
7941     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7942
7943     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7944                                shouldn't touch it */
7945
7946     *(get_invlist_offset_addr(invlist)) = offset;
7947
7948     /* The 'length' passed to us is the physical number of elements in the
7949      * inversion list.  But if there is an offset the logical number is one
7950      * less than that */
7951     invlist_set_len(invlist, length  - offset, offset);
7952
7953     invlist_set_previous_index(invlist, 0);
7954
7955     /* Initialize the iteration pointer. */
7956     invlist_iterfinish(invlist);
7957
7958     SvREADONLY_on(invlist);
7959
7960     return invlist;
7961 }
7962 #endif /* ifndef PERL_IN_XSUB_RE */
7963
7964 STATIC void
7965 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7966 {
7967     /* Grow the maximum size of an inversion list */
7968
7969     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7970
7971     assert(SvTYPE(invlist) == SVt_INVLIST);
7972
7973     /* Add one to account for the zero element at the beginning which may not
7974      * be counted by the calling parameters */
7975     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7976 }
7977
7978 PERL_STATIC_INLINE void
7979 S_invlist_trim(pTHX_ SV* const invlist)
7980 {
7981     PERL_ARGS_ASSERT_INVLIST_TRIM;
7982
7983     assert(SvTYPE(invlist) == SVt_INVLIST);
7984
7985     /* Change the length of the inversion list to how many entries it currently
7986      * has */
7987     SvPV_shrink_to_cur((SV *) invlist);
7988 }
7989
7990 STATIC void
7991 S__append_range_to_invlist(pTHX_ SV* const invlist,
7992                                  const UV start, const UV end)
7993 {
7994    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7995     * the end of the inversion list.  The range must be above any existing
7996     * ones. */
7997
7998     UV* array;
7999     UV max = invlist_max(invlist);
8000     UV len = _invlist_len(invlist);
8001     bool offset;
8002
8003     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8004
8005     if (len == 0) { /* Empty lists must be initialized */
8006         offset = start != 0;
8007         array = _invlist_array_init(invlist, ! offset);
8008     }
8009     else {
8010         /* Here, the existing list is non-empty. The current max entry in the
8011          * list is generally the first value not in the set, except when the
8012          * set extends to the end of permissible values, in which case it is
8013          * the first entry in that final set, and so this call is an attempt to
8014          * append out-of-order */
8015
8016         UV final_element = len - 1;
8017         array = invlist_array(invlist);
8018         if (array[final_element] > start
8019             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8020         {
8021             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8022                      array[final_element], start,
8023                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8024         }
8025
8026         /* Here, it is a legal append.  If the new range begins with the first
8027          * value not in the set, it is extending the set, so the new first
8028          * value not in the set is one greater than the newly extended range.
8029          * */
8030         offset = *get_invlist_offset_addr(invlist);
8031         if (array[final_element] == start) {
8032             if (end != UV_MAX) {
8033                 array[final_element] = end + 1;
8034             }
8035             else {
8036                 /* But if the end is the maximum representable on the machine,
8037                  * just let the range that this would extend to have no end */
8038                 invlist_set_len(invlist, len - 1, offset);
8039             }
8040             return;
8041         }
8042     }
8043
8044     /* Here the new range doesn't extend any existing set.  Add it */
8045
8046     len += 2;   /* Includes an element each for the start and end of range */
8047
8048     /* If wll overflow the existing space, extend, which may cause the array to
8049      * be moved */
8050     if (max < len) {
8051         invlist_extend(invlist, len);
8052
8053         /* Have to set len here to avoid assert failure in invlist_array() */
8054         invlist_set_len(invlist, len, offset);
8055
8056         array = invlist_array(invlist);
8057     }
8058     else {
8059         invlist_set_len(invlist, len, offset);
8060     }
8061
8062     /* The next item on the list starts the range, the one after that is
8063      * one past the new range.  */
8064     array[len - 2] = start;
8065     if (end != UV_MAX) {
8066         array[len - 1] = end + 1;
8067     }
8068     else {
8069         /* But if the end is the maximum representable on the machine, just let
8070          * the range have no end */
8071         invlist_set_len(invlist, len - 1, offset);
8072     }
8073 }
8074
8075 #ifndef PERL_IN_XSUB_RE
8076
8077 IV
8078 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8079 {
8080     /* Searches the inversion list for the entry that contains the input code
8081      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8082      * return value is the index into the list's array of the range that
8083      * contains <cp> */
8084
8085     IV low = 0;
8086     IV mid;
8087     IV high = _invlist_len(invlist);
8088     const IV highest_element = high - 1;
8089     const UV* array;
8090
8091     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8092
8093     /* If list is empty, return failure. */
8094     if (high == 0) {
8095         return -1;
8096     }
8097
8098     /* (We can't get the array unless we know the list is non-empty) */
8099     array = invlist_array(invlist);
8100
8101     mid = invlist_previous_index(invlist);
8102     assert(mid >=0 && mid <= highest_element);
8103
8104     /* <mid> contains the cache of the result of the previous call to this
8105      * function (0 the first time).  See if this call is for the same result,
8106      * or if it is for mid-1.  This is under the theory that calls to this
8107      * function will often be for related code points that are near each other.
8108      * And benchmarks show that caching gives better results.  We also test
8109      * here if the code point is within the bounds of the list.  These tests
8110      * replace others that would have had to be made anyway to make sure that
8111      * the array bounds were not exceeded, and these give us extra information
8112      * at the same time */
8113     if (cp >= array[mid]) {
8114         if (cp >= array[highest_element]) {
8115             return highest_element;
8116         }
8117
8118         /* Here, array[mid] <= cp < array[highest_element].  This means that
8119          * the final element is not the answer, so can exclude it; it also
8120          * means that <mid> is not the final element, so can refer to 'mid + 1'
8121          * safely */
8122         if (cp < array[mid + 1]) {
8123             return mid;
8124         }
8125         high--;
8126         low = mid + 1;
8127     }
8128     else { /* cp < aray[mid] */
8129         if (cp < array[0]) { /* Fail if outside the array */
8130             return -1;
8131         }
8132         high = mid;
8133         if (cp >= array[mid - 1]) {
8134             goto found_entry;
8135         }
8136     }
8137
8138     /* Binary search.  What we are looking for is <i> such that
8139      *  array[i] <= cp < array[i+1]
8140      * The loop below converges on the i+1.  Note that there may not be an
8141      * (i+1)th element in the array, and things work nonetheless */
8142     while (low < high) {
8143         mid = (low + high) / 2;
8144         assert(mid <= highest_element);
8145         if (array[mid] <= cp) { /* cp >= array[mid] */
8146             low = mid + 1;
8147
8148             /* We could do this extra test to exit the loop early.
8149             if (cp < array[low]) {
8150                 return mid;
8151             }
8152             */
8153         }
8154         else { /* cp < array[mid] */
8155             high = mid;
8156         }
8157     }
8158
8159   found_entry:
8160     high--;
8161     invlist_set_previous_index(invlist, high);
8162     return high;
8163 }
8164
8165 void
8166 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8167                                     const UV start, const UV end, U8* swatch)
8168 {
8169     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8170      * but is used when the swash has an inversion list.  This makes this much
8171      * faster, as it uses a binary search instead of a linear one.  This is
8172      * intimately tied to that function, and perhaps should be in utf8.c,
8173      * except it is intimately tied to inversion lists as well.  It assumes
8174      * that <swatch> is all 0's on input */
8175
8176     UV current = start;
8177     const IV len = _invlist_len(invlist);
8178     IV i;
8179     const UV * array;
8180
8181     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8182
8183     if (len == 0) { /* Empty inversion list */
8184         return;
8185     }
8186
8187     array = invlist_array(invlist);
8188
8189     /* Find which element it is */
8190     i = _invlist_search(invlist, start);
8191
8192     /* We populate from <start> to <end> */
8193     while (current < end) {
8194         UV upper;
8195
8196         /* The inversion list gives the results for every possible code point
8197          * after the first one in the list.  Only those ranges whose index is
8198          * even are ones that the inversion list matches.  For the odd ones,
8199          * and if the initial code point is not in the list, we have to skip
8200          * forward to the next element */
8201         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8202             i++;
8203             if (i >= len) { /* Finished if beyond the end of the array */
8204                 return;
8205             }
8206             current = array[i];
8207             if (current >= end) {   /* Finished if beyond the end of what we
8208                                        are populating */
8209                 if (LIKELY(end < UV_MAX)) {
8210                     return;
8211                 }
8212
8213                 /* We get here when the upper bound is the maximum
8214                  * representable on the machine, and we are looking for just
8215                  * that code point.  Have to special case it */
8216                 i = len;
8217                 goto join_end_of_list;
8218             }
8219         }
8220         assert(current >= start);
8221
8222         /* The current range ends one below the next one, except don't go past
8223          * <end> */
8224         i++;
8225         upper = (i < len && array[i] < end) ? array[i] : end;
8226
8227         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8228          * for each code point in it */
8229         for (; current < upper; current++) {
8230             const STRLEN offset = (STRLEN)(current - start);
8231             swatch[offset >> 3] |= 1 << (offset & 7);
8232         }
8233
8234     join_end_of_list:
8235
8236         /* Quit if at the end of the list */
8237         if (i >= len) {
8238
8239             /* But first, have to deal with the highest possible code point on
8240              * the platform.  The previous code assumes that <end> is one
8241              * beyond where we want to populate, but that is impossible at the
8242              * platform's infinity, so have to handle it specially */
8243             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8244             {
8245                 const STRLEN offset = (STRLEN)(end - start);
8246                 swatch[offset >> 3] |= 1 << (offset & 7);
8247             }
8248             return;
8249         }
8250
8251         /* Advance to the next range, which will be for code points not in the
8252          * inversion list */
8253         current = array[i];
8254     }
8255
8256     return;
8257 }
8258
8259 void
8260 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8261                                          const bool complement_b, SV** output)
8262 {
8263     /* Take the union of two inversion lists and point <output> to it.  *output
8264      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8265      * the reference count to that list will be decremented if not already a
8266      * temporary (mortal); otherwise *output will be made correspondingly
8267      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8268      * second list is returned.  If <complement_b> is TRUE, the union is taken
8269      * of the complement (inversion) of <b> instead of b itself.
8270      *
8271      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8272      * Richard Gillam, published by Addison-Wesley, and explained at some
8273      * length there.  The preface says to incorporate its examples into your
8274      * code at your own risk.
8275      *
8276      * The algorithm is like a merge sort.
8277      *
8278      * XXX A potential performance improvement is to keep track as we go along
8279      * if only one of the inputs contributes to the result, meaning the other
8280      * is a subset of that one.  In that case, we can skip the final copy and
8281      * return the larger of the input lists, but then outside code might need
8282      * to keep track of whether to free the input list or not */
8283
8284     const UV* array_a;    /* a's array */
8285     const UV* array_b;
8286     UV len_a;       /* length of a's array */
8287     UV len_b;
8288
8289     SV* u;                      /* the resulting union */
8290     UV* array_u;
8291     UV len_u;
8292
8293     UV i_a = 0;             /* current index into a's array */
8294     UV i_b = 0;
8295     UV i_u = 0;
8296
8297     /* running count, as explained in the algorithm source book; items are
8298      * stopped accumulating and are output when the count changes to/from 0.
8299      * The count is incremented when we start a range that's in the set, and
8300      * decremented when we start a range that's not in the set.  So its range
8301      * is 0 to 2.  Only when the count is zero is something not in the set.
8302      */
8303     UV count = 0;
8304
8305     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8306     assert(a != b);
8307
8308     /* If either one is empty, the union is the other one */
8309     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8310         bool make_temp = FALSE; /* Should we mortalize the result? */
8311
8312         if (*output == a) {
8313             if (a != NULL) {
8314                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8315                     SvREFCNT_dec_NN(a);
8316                 }
8317             }
8318         }
8319         if (*output != b) {
8320             *output = invlist_clone(b);
8321             if (complement_b) {
8322                 _invlist_invert(*output);
8323             }
8324         } /* else *output already = b; */
8325
8326         if (make_temp) {
8327             sv_2mortal(*output);
8328         }
8329         return;
8330     }
8331     else if ((len_b = _invlist_len(b)) == 0) {
8332         bool make_temp = FALSE;
8333         if (*output == b) {
8334             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8335                 SvREFCNT_dec_NN(b);
8336             }
8337         }
8338
8339         /* The complement of an empty list is a list that has everything in it,
8340          * so the union with <a> includes everything too */
8341         if (complement_b) {
8342             if (a == *output) {
8343                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8344                     SvREFCNT_dec_NN(a);
8345                 }
8346             }
8347             *output = _new_invlist(1);
8348             _append_range_to_invlist(*output, 0, UV_MAX);
8349         }
8350         else if (*output != a) {
8351             *output = invlist_clone(a);
8352         }
8353         /* else *output already = a; */
8354
8355         if (make_temp) {
8356             sv_2mortal(*output);
8357         }
8358         return;
8359     }
8360
8361     /* Here both lists exist and are non-empty */
8362     array_a = invlist_array(a);
8363     array_b = invlist_array(b);
8364
8365     /* If are to take the union of 'a' with the complement of b, set it
8366      * up so are looking at b's complement. */
8367     if (complement_b) {
8368
8369         /* To complement, we invert: if the first element is 0, remove it.  To
8370          * do this, we just pretend the array starts one later */
8371         if (array_b[0] == 0) {
8372             array_b++;
8373             len_b--;
8374         }
8375         else {
8376
8377             /* But if the first element is not zero, we pretend the list starts
8378              * at the 0 that is always stored immediately before the array. */
8379             array_b--;
8380             len_b++;
8381         }
8382     }
8383
8384     /* Size the union for the worst case: that the sets are completely
8385      * disjoint */
8386     u = _new_invlist(len_a + len_b);
8387
8388     /* Will contain U+0000 if either component does */
8389     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8390                                       || (len_b > 0 && array_b[0] == 0));
8391
8392     /* Go through each list item by item, stopping when exhausted one of
8393      * them */
8394     while (i_a < len_a && i_b < len_b) {
8395         UV cp;      /* The element to potentially add to the union's array */
8396         bool cp_in_set;   /* is it in the the input list's set or not */
8397
8398         /* We need to take one or the other of the two inputs for the union.
8399          * Since we are merging two sorted lists, we take the smaller of the
8400          * next items.  In case of a tie, we take the one that is in its set
8401          * first.  If we took one not in the set first, it would decrement the
8402          * count, possibly to 0 which would cause it to be output as ending the
8403          * range, and the next time through we would take the same number, and
8404          * output it again as beginning the next range.  By doing it the
8405          * opposite way, there is no possibility that the count will be
8406          * momentarily decremented to 0, and thus the two adjoining ranges will
8407          * be seamlessly merged.  (In a tie and both are in the set or both not
8408          * in the set, it doesn't matter which we take first.) */
8409         if (array_a[i_a] < array_b[i_b]
8410             || (array_a[i_a] == array_b[i_b]
8411                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8412         {
8413             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8414             cp= array_a[i_a++];
8415         }
8416         else {
8417             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8418             cp = array_b[i_b++];
8419         }
8420
8421         /* Here, have chosen which of the two inputs to look at.  Only output
8422          * if the running count changes to/from 0, which marks the
8423          * beginning/end of a range in that's in the set */
8424         if (cp_in_set) {
8425             if (count == 0) {
8426                 array_u[i_u++] = cp;
8427             }
8428             count++;
8429         }
8430         else {
8431             count--;
8432             if (count == 0) {
8433                 array_u[i_u++] = cp;
8434             }
8435         }
8436     }
8437
8438     /* Here, we are finished going through at least one of the lists, which
8439      * means there is something remaining in at most one.  We check if the list
8440      * that hasn't been exhausted is positioned such that we are in the middle
8441      * of a range in its set or not.  (i_a and i_b point to the element beyond
8442      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8443      * is potentially more to output.
8444      * There are four cases:
8445      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8446      *     in the union is entirely from the non-exhausted set.
8447      *  2) Both were in their sets, count is 2.  Nothing further should
8448      *     be output, as everything that remains will be in the exhausted
8449      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8450      *     that
8451      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8452      *     Nothing further should be output because the union includes
8453      *     everything from the exhausted set.  Not decrementing ensures that.
8454      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8455      *     decrementing to 0 insures that we look at the remainder of the
8456      *     non-exhausted set */
8457     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8458         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8459     {
8460         count--;
8461     }
8462
8463     /* The final length is what we've output so far, plus what else is about to
8464      * be output.  (If 'count' is non-zero, then the input list we exhausted
8465      * has everything remaining up to the machine's limit in its set, and hence
8466      * in the union, so there will be no further output. */
8467     len_u = i_u;
8468     if (count == 0) {
8469         /* At most one of the subexpressions will be non-zero */
8470         len_u += (len_a - i_a) + (len_b - i_b);
8471     }
8472
8473     /* Set result to final length, which can change the pointer to array_u, so
8474      * re-find it */
8475     if (len_u != _invlist_len(u)) {
8476         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8477         invlist_trim(u);
8478         array_u = invlist_array(u);
8479     }
8480
8481     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8482      * the other) ended with everything above it not in its set.  That means
8483      * that the remaining part of the union is precisely the same as the
8484      * non-exhausted list, so can just copy it unchanged.  (If both list were
8485      * exhausted at the same time, then the operations below will be both 0.)
8486      */
8487     if (count == 0) {
8488         IV copy_count; /* At most one will have a non-zero copy count */
8489         if ((copy_count = len_a - i_a) > 0) {
8490             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8491         }
8492         else if ((copy_count = len_b - i_b) > 0) {
8493             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8494         }
8495     }
8496
8497     /*  We may be removing a reference to one of the inputs.  If so, the output
8498      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8499      *  count decremented) */
8500     if (a == *output || b == *output) {
8501         assert(! invlist_is_iterating(*output));
8502         if ((SvTEMP(*output))) {
8503             sv_2mortal(u);
8504         }
8505         else {
8506             SvREFCNT_dec_NN(*output);
8507         }
8508     }
8509
8510     *output = u;
8511
8512     return;
8513 }
8514
void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
                                               const bool complement_b, SV** i)
{
    /* Take the intersection of two inversion lists and point <i> to it.
     *
     *  a             The first list.  It may be NULL, which is treated the
     *                same as an empty list.
     *  b             The second list.  It must not be the same SV as <a>
     *                (asserted below), and it is handed to _invlist_len()
     *                without a NULL check whenever <a> is non-empty.
     *  complement_b  If TRUE, the result is the intersection of <a> and the
     *                complement (or inversion) of <b> instead of <b> directly.
     *  i             Where the result is stored.  *i SHOULD BE DEFINED upon
     *                input.  If *i is one of the two input lists, that list
     *                is being replaced by the result: if it is not a
     *                temporary (mortal) its reference count is decremented;
     *                if it is mortal, it is left alone and the result is made
     *                mortal instead.  (The one exception: when the result is
     *                exactly <a> and *i already is <a>, nothing is changed.)
     *
     * NOTE(review): if <a> is NULL and *i is NULL as well, the '*i == a' test
     * in the empty-list special case below succeeds and SvTEMP() is applied
     * to a NULL pointer.  Presumably callers never pass that combination --
     * confirm.
     *
     * The basis for this comes from "Unicode Demystified" Chapter 13 by
     * Richard Gillam, published by Addison-Wesley, and explained at some
     * length there.  The preface says to incorporate its examples into your
     * code at your own risk.  In fact, it had bugs
     *
     * The algorithm is like a merge sort, and is essentially the same as the
     * union above
     */

    const UV* array_a;          /* a's array */
    const UV* array_b;          /* b's array (possibly shifted by one element
                                   to stand for b's complement) */
    UV len_a;   /* length of a's array */
    UV len_b;   /* length of b's array (adjusted along with array_b) */

    SV* r;                   /* the resulting intersection */
    UV* array_r;             /* r's array */
    UV len_r;                /* final length of r's array */

    UV i_a = 0;             /* current index into a's array */
    UV i_b = 0;             /* current index into b's array */
    UV i_r = 0;             /* next free index in r's array */

    /* running count, as explained in the algorithm source book; items are
     * stopped accumulating and are output when the count changes to/from 2.
     * The count is incremented when we start a range that's in the set, and
     * decremented when we start a range that's not in the set.  So its range
     * is 0 to 2 during the merge loop.  Only when the count is 2 is something
     * in the intersection.
     */
    UV count = 0;

    PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
    assert(a != b);

    /* Special case if either one is empty.  Note that len_b is only assigned
     * when 'a' is non-empty; it is not read on any path where it is unset. */
    len_a = (a == NULL) ? 0 : _invlist_len(a);
    if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
        bool make_temp = FALSE; /* Should the result be made mortal? */

        if (len_a != 0 && complement_b) {

            /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
             * be empty.  Here, also we are using 'b's complement, which hence
             * must be every possible code point.  Thus the intersection is
             * simply 'a'. */
            if (*i != a) {
                if (*i == b) {

                    /* The result replaces 'b': drop our reference to it, or,
                     * if it is mortal, remember to make the result mortal */
                    if (! (make_temp = cBOOL(SvTEMP(b)))) {
                        SvREFCNT_dec_NN(b);
                    }
                }

                *i = invlist_clone(a);
            }
            /* else *i is already 'a' */

            if (make_temp) {
                sv_2mortal(*i);
            }
            return;
        }

        /* Here, either 'a' is empty (whether or not 'b' is being
         * complemented), or 'b' is empty and is not being complemented.  In
         * all these cases the intersection must be empty.  If the result
         * replaces one of the inputs, release that input (or arrange for the
         * result to be mortal if the input was) */
        if (*i == a) {
            if (! (make_temp = cBOOL(SvTEMP(a)))) {
                SvREFCNT_dec_NN(a);
            }
        }
        else if (*i == b) {
            if (! (make_temp = cBOOL(SvTEMP(b)))) {
                SvREFCNT_dec_NN(b);
            }
        }
        *i = _new_invlist(0);
        if (make_temp) {
            sv_2mortal(*i);
        }

        return;
    }

    /* Here both lists exist and are non-empty */
    array_a = invlist_array(a);
    array_b = invlist_array(b);

    /* If are to take the intersection of 'a' with the complement of b, set it
     * up so are looking at b's complement. */
    if (complement_b) {

        /* To complement, we invert: if the first element is 0, remove it.  To
         * do this, we just pretend the array starts one later */
        if (array_b[0] == 0) {
            array_b++;
            len_b--;
        }
        else {

            /* But if the first element is not zero, we pretend the list starts
             * at the 0 that is always stored immediately before the array. */
            array_b--;
            len_b++;
        }
    }

    /* Size the intersection for the worst case: that the intersection ends up
     * fragmenting everything to be completely disjoint */
    r= _new_invlist(len_a + len_b);

    /* Will contain U+0000 iff both components do.  (len_b can have become 0
     * above if 'b' was just the single element 0 and was complemented, hence
     * the length checks.) */
    array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
                                     && len_b > 0 && array_b[0] == 0);

    /* Go through each list item by item, stopping when exhausted one of
     * them */
    while (i_a < len_a && i_b < len_b) {
        UV cp;      /* The element to potentially add to the intersection's
                       array */
        bool cp_in_set; /* Is it in the input list's set or not */

        /* We need to take one or the other of the two inputs for the
         * intersection.  Since we are merging two sorted lists, we take the
         * smaller of the next items.  In case of a tie, we take the one that
         * is not in its set first (a difference from the union algorithm).  If
         * we took one in the set first, it would increment the count, possibly
         * to 2 which would cause it to be output as starting a range in the
         * intersection, and the next time through we would take that same
         * number, and output it again as ending the set.  By doing it the
         * opposite of this, there is no possibility that the count will be
         * momentarily incremented to 2.  (In a tie and both are in the set or
         * both not in the set, it doesn't matter which we take first.) */
        if (array_a[i_a] < array_b[i_b]
            || (array_a[i_a] == array_b[i_b]
                && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
        {
            cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
            cp= array_a[i_a++];
        }
        else {
            cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
            cp= array_b[i_b++];
        }

        /* Here, have chosen which of the two inputs to look at.  Only output
         * if the running count changes to/from 2, which marks the
         * beginning/end of a range that's in the intersection */
        if (cp_in_set) {
            count++;
            if (count == 2) {
                array_r[i_r++] = cp;
            }
        }
        else {
            if (count == 2) {
                array_r[i_r++] = cp;
            }
            count--;
        }
    }

    /* Here, we are finished going through at least one of the lists, which
     * means there is something remaining in at most one.  We check if the list
     * that has been exhausted is positioned such that we are in the middle
     * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
     * the ones we care about.)  There are four cases:
     *  1) Both weren't in their sets, count is 0, and remains 0.  There's
     *     nothing left in the intersection.
     *  2) Both were in their sets, count is 2 and perhaps is incremented to
     *     above 2.  What should be output is exactly that which is in the
     *     non-exhausted set, as everything it has is also in the intersection
     *     set, and everything it doesn't have can't be in the intersection
     *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
     *     gets incremented to 2.  Like the previous case, the intersection is
     *     everything that remains in the non-exhausted set.
     *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
     *     remains 1.  And the intersection has nothing more. */
    if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
        || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
    {
        count++;
    }

    /* The final length is what we've output so far plus what else is in the
     * intersection.  At most one of the subexpressions below will be
     * non-zero */
    len_r = i_r;
    if (count >= 2) {
        len_r += (len_a - i_a) + (len_b - i_b);
    }

    /* Set result to final length, which can change the pointer to array_r, so
     * re-find it */
    if (len_r != _invlist_len(r)) {
        invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
        invlist_trim(r);
        array_r = invlist_array(r);
    }

    /* Finish outputting any remaining: the tail of the non-exhausted list is
     * copied verbatim after what has been output so far */
    if (count >= 2) { /* At most one will have a non-zero copy count */
        IV copy_count;
        if ((copy_count = len_a - i_a) > 0) {
            Copy(array_a + i_a, array_r + i_r, copy_count, UV);
        }
        else if ((copy_count = len_b - i_b) > 0) {
            Copy(array_b + i_b, array_r + i_r, copy_count, UV);
        }
    }

    /*  We may be removing a reference to one of the inputs.  If so, the output
     *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
     *  count decremented) */
    if (a == *i || b == *i) {
        assert(! invlist_is_iterating(*i));
        if (SvTEMP(*i)) {
            sv_2mortal(r);
        }
        else {
            SvREFCNT_dec_NN(*i);
        }
    }

    *i = r;

    return;
}
8753
8754 SV*
8755 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8756 {
8757     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8758      * set.  A pointer to the inversion list is returned.  This may actually be
8759      * a new list, in which case the passed in one has been destroyed.  The
8760      * passed in inversion list can be NULL, in which case a new one is created
8761      * with just the one range in it */
8762
8763     SV* range_invlist;
8764     UV len;
8765
8766     if (invlist == NULL) {
8767         invlist = _new_invlist(2);
8768         len = 0;
8769     }
8770     else {
8771         len = _invlist_len(invlist);
8772     }
8773
8774     /* If comes after the final entry actually in the list, can just append it
8775      * to the end, */
8776     if (len == 0
8777         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8778             && start >= invlist_array(invlist)[len - 1]))
8779     {
8780         _append_range_to_invlist(invlist, start, end);
8781         return invlist;
8782     }
8783
8784     /* Here, can't just append things, create and return a new inversion list
8785      * which is the union of this range and the existing inversion list */
8786     range_invlist = _new_invlist(2);
8787     _append_range_to_invlist(range_invlist, start, end);
8788
8789     _invlist_union(invlist, range_invlist, &invlist);
8790
8791     /* The temporary can be freed */
8792     SvREFCNT_dec_NN(range_invlist);
8793
8794     return invlist;
8795 }
8796
8797 SV*
8798 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8799                                  UV** other_elements_ptr)
8800 {
8801     /* Create and return an inversion list whose contents are to be populated
8802      * by the caller.  The caller gives the number of elements (in 'size') and
8803      * the very first element ('element0').  This function will set
8804      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8805      * are to be placed.
8806      *
8807      * Obviously there is some trust involved that the caller will properly
8808      * fill in the other elements of the array.
8809      *
8810      * (The first element needs to be passed in, as the underlying code does
8811      * things differently depending on whether it is zero or non-zero) */
8812
8813     SV* invlist = _new_invlist(size);
8814     bool offset;
8815
8816     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8817
8818     _append_range_to_invlist(invlist, element0, element0);
8819     offset = *get_invlist_offset_addr(invlist);
8820
8821     invlist_set_len(invlist, size, offset);
8822     *other_elements_ptr = invlist_array(invlist) + 1;
8823     return invlist;
8824 }
8825
8826 #endif
8827
/* Add the single code point 'cp' to the inversion list 'invlist' by treating
 * it as the one-element range cp..cp.  Whatever _add_range_to_invlist()
 * returns is passed straight back to the caller. */
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
    return _add_range_to_invlist(invlist, cp, cp);
}
8832
8833 #ifndef PERL_IN_XSUB_RE
8834 void
8835 Perl__invlist_invert(pTHX_ SV* const invlist)
8836 {
8837     /* Complement the input inversion list.  This adds a 0 if the list didn't
8838      * have a zero; removes it otherwise.  As described above, the data
8839      * structure is set up so that this is very efficient */
8840
8841     PERL_ARGS_ASSERT__INVLIST_INVERT;
8842
8843     assert(! invlist_is_iterating(invlist));
8844
8845     /* The inverse of matching nothing is matching everything */
8846     if (_invlist_len(invlist) == 0) {
8847         _append_range_to_invlist(invlist, 0, UV_MAX);
8848         return;
8849     }
8850
8851     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8852 }
8853
8854 #endif
8855
8856 PERL_STATIC_INLINE SV*
8857 S_invlist_clone(pTHX_ SV* const invlist)
8858 {
8859
8860     /* Return a new inversion list that is a copy of the input one, which is
8861      * unchanged.  The new list will not be mortal even if the old one was. */
8862
8863     /* Need to allocate extra space to accommodate Perl's addition of a
8864      * trailing NUL to SvPV's, since it thinks they are always strings */
8865     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8866     STRLEN physical_length = SvCUR(invlist);
8867     bool offset = *(get_invlist_offset_addr(invlist));
8868
8869     PERL_ARGS_ASSERT_INVLIST_CLONE;
8870
8871     *(get_invlist_offset_addr(new_invlist)) = offset;
8872     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8873     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8874
8875     return new_invlist;
8876 }
8877
8878 PERL_STATIC_INLINE STRLEN*
8879 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8880 {
8881     /* Return the address of the UV that contains the current iteration
8882      * position */
8883
8884     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8885
8886     assert(SvTYPE(invlist) == SVt_INVLIST);
8887
8888     return &(((XINVLIST*) SvANY(invlist))->iterator);
8889 }
8890
8891 PERL_STATIC_INLINE void
8892 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8893 {
8894     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8895
8896     *get_invlist_iter_addr(invlist) = 0;
8897 }
8898
8899 PERL_STATIC_INLINE void
8900 S_invlist_iterfinish(pTHX_ SV* invlist)
8901 {
8902     /* Terminate iterator for invlist.  This is to catch development errors.
8903      * Any iteration that is interrupted before completed should call this
8904      * function.  Functions that add code points anywhere else but to the end
8905      * of an inversion list assert that they are not in the middle of an
8906      * iteration.  If they were, the addition would make the iteration
8907      * problematical: if the iteration hadn't reached the place where things
8908      * were being added, it would be ok */
8909
8910     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8911
8912     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8913 }
8914
8915 STATIC bool
8916 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8917 {
8918     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8919      * This call sets in <*start> and <*end>, the next range in <invlist>.
8920      * Returns <TRUE> if successful and the next call will return the next
8921      * range; <FALSE> if was already at the end of the list.  If the latter,
8922      * <*start> and <*end> are unchanged, and the next call to this function
8923      * will start over at the beginning of the list */
8924
8925     STRLEN* pos = get_invlist_iter_addr(invlist);
8926     UV len = _invlist_len(invlist);
8927     UV *array;
8928
8929     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8930
8931     if (*pos >= len) {
8932         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8933         return FALSE;
8934     }
8935
8936     array = invlist_array(invlist);
8937
8938     *start = array[(*pos)++];
8939
8940     if (*pos >= len) {
8941         *end = UV_MAX;
8942     }
8943     else {
8944         *end = array[(*pos)++] - 1;
8945     }
8946
8947     return TRUE;
8948 }
8949
8950 PERL_STATIC_INLINE bool
8951 S_invlist_is_iterating(pTHX_ SV* const invlist)
8952 {
8953     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8954
8955     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8956 }
8957
8958 PERL_STATIC_INLINE UV
8959 S_invlist_highest(pTHX_ SV* const invlist)
8960 {
8961     /* Returns the highest code point that matches an inversion list.  This API
8962      * has an ambiguity, as it returns 0 under either the highest is actually
8963      * 0, or if the list is empty.  If this distinction matters to you, check
8964      * for emptiness before calling this function */
8965
8966     UV len = _invlist_len(invlist);
8967     UV *array;
8968
8969     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8970
8971     if (len == 0) {
8972         return 0;
8973     }
8974
8975     array = invlist_array(invlist);
8976
8977     /* The last element in the array in the inversion list always starts a
8978      * range that goes to infinity.  That range may be for code points that are
8979      * matched in the inversion list, or it may be for ones that aren't
8980      * matched.  In the latter case, the highest code point in the set is one
8981      * less than the beginning of this range; otherwise it is the final element
8982      * of this range: infinity */
8983     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8984            ? UV_MAX
8985            : array[len - 1] - 1;
8986 }
8987
8988 #ifndef PERL_IN_XSUB_RE
8989 SV *
8990 Perl__invlist_contents(pTHX_ SV* const invlist)
8991 {
8992     /* Get the contents of an inversion list into a string SV so that they can
8993      * be printed out.  It uses the format traditionally done for debug tracing
8994      */
8995
8996     UV start, end;
8997     SV* output = newSVpvs("\n");
8998
8999     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9000
9001     assert(! invlist_is_iterating(invlist));
9002
9003     invlist_iterinit(invlist);
9004     while (invlist_iternext(invlist, &start, &end)) {
9005         if (end == UV_MAX) {
9006             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9007         }
9008         else if (end != start) {
9009             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9010                     start,       end);
9011         }
9012         else {
9013             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9014         }
9015     }
9016
9017     return output;
9018 }
9019 #endif
9020
9021 #ifndef PERL_IN_XSUB_RE
9022 void
9023 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9024                          const char * const indent, SV* const invlist)
9025 {
9026     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9027      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9028      * the string 'indent'.  The output looks like this:
9029          [0] 0x000A .. 0x000D
9030          [2] 0x0085
9031          [4] 0x2028 .. 0x2029
9032          [6] 0x3104 .. INFINITY
9033      * This means that the first range of code points matched by the list are
9034      * 0xA through 0xD; the second range contains only the single code point
9035      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9036      * are used to define each range (except if the final range extends to
9037      * infinity, only a single element is needed).  The array index of the
9038      * first element for the corresponding range is given in brackets. */
9039
9040     UV start, end;
9041     STRLEN count = 0;
9042
9043     PERL_ARGS_ASSERT__INVLIST_DUMP;
9044
9045     if (invlist_is_iterating(invlist)) {
9046         Perl_dump_indent(aTHX_ level, file,
9047              "%sCan't dump inversion list because is in middle of iterating\n",
9048              indent);
9049         return;
9050     }
9051
9052     invlist_iterinit(invlist);
9053     while (invlist_iternext(invlist, &start, &end)) {
9054         if (end == UV_MAX) {
9055             Perl_dump_indent(aTHX_ level, file,
9056                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9057                                    indent, (UV)count, start);
9058         }
9059         else if (end != start) {
9060             Perl_dump_indent(aTHX_ level, file,
9061                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9062                                 indent, (UV)count, start,         end);
9063         }
9064         else {
9065             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9066                                             indent, (UV)count, start);
9067         }
9068         count += 2;
9069     }
9070 }
9071 #endif
9072
9073 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9074 bool
9075 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9076 {
9077     /* Return a boolean as to if the two passed in inversion lists are
9078      * identical.  The final argument, if TRUE, says to take the complement of
9079      * the second inversion list before doing the comparison */
9080
9081     const UV* array_a = invlist_array(a);
9082     const UV* array_b = invlist_array(b);
9083     UV len_a = _invlist_len(a);
9084     UV len_b = _invlist_len(b);
9085
9086     UV i = 0;               /* current index into the arrays */
9087     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9088
9089     PERL_ARGS_ASSERT__INVLISTEQ;
9090
9091     /* If are to compare 'a' with the complement of b, set it
9092      * up so are looking at b's complement. */
9093     if (complement_b) {
9094
9095         /* The complement of nothing is everything, so <a> would have to have
9096          * just one element, starting at zero (ending at infinity) */
9097         if (len_b == 0) {
9098             return (len_a == 1 && array_a[0] == 0);
9099         }
9100         else if (array_b[0] == 0) {
9101
9102             /* Otherwise, to complement, we invert.  Here, the first element is
9103              * 0, just remove it.  To do this, we just pretend the array starts
9104              * one later */
9105
9106             array_b++;
9107             len_b--;
9108         }
9109         else {
9110
9111             /* But if the first element is not zero, we pretend the list starts
9112              * at the 0 that is always stored immediately before the array. */
9113             array_b--;
9114             len_b++;
9115         }
9116     }
9117
9118     /* Make sure that the lengths are the same, as well as the final element
9119      * before looping through the remainder.  (Thus we test the length, final,
9120      * and first elements right off the bat) */
9121     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9122         retval = FALSE;
9123     }
9124     else for (i = 0; i < len_a - 1; i++) {
9125         if (array_a[i] != array_b[i]) {
9126             retval = FALSE;
9127             break;
9128         }
9129     }
9130
9131     return retval;
9132 }
9133 #endif
9134
9135 #undef HEADER_LENGTH
9136 #undef TO_INTERNAL_SIZE
9137 #undef FROM_INTERNAL_SIZE
9138 #undef INVLIST_VERSION_ID
9139
9140 /* End of inversion list object */
9141
9142 STATIC void
9143 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9144 {
9145     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9146      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9147      * should point to the first flag; it is updated on output to point to the
9148      * final ')' or ':'.  There needs to be at least one flag, or this will
9149      * abort */
9150
9151     /* for (?g), (?gc), and (?o) warnings; warning
9152        about (?c) will warn about (?g) -- japhy    */
9153
9154 #define WASTED_O  0x01
9155 #define WASTED_G  0x02
9156 #define WASTED_C  0x04
9157 #define WASTED_GC (WASTED_G|WASTED_C)
9158     I32 wastedflags = 0x00;
9159     U32 posflags = 0, negflags = 0;
9160     U32 *flagsp = &posflags;
9161     char has_charset_modifier = '\0';
9162     regex_charset cs;
9163     bool has_use_defaults = FALSE;
9164     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9165
9166     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9167
9168     /* '^' as an initial flag sets certain defaults */
9169     if (UCHARAT(RExC_parse) == '^') {
9170         RExC_parse++;
9171         has_use_defaults = TRUE;
9172         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9173         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9174                                         ? REGEX_UNICODE_CHARSET
9175                                         : REGEX_DEPENDS_CHARSET);
9176     }
9177
9178     cs = get_regex_charset(RExC_flags);
9179     if (cs == REGEX_DEPENDS_CHARSET
9180         && (RExC_utf8 || RExC_uni_semantics))
9181     {
9182         cs = REGEX_UNICODE_CHARSET;
9183     }
9184
9185     while (*RExC_parse) {
9186         /* && strchr("iogcmsx", *RExC_parse) */
9187         /* (?g), (?gc) and (?o) are useless here
9188            and must be globally applied -- japhy */
9189         switch (*RExC_parse) {
9190
9191             /* Code for the imsx flags */
9192             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9193
9194             case LOCALE_PAT_MOD:
9195                 if (has_charset_modifier) {
9196                     goto excess_modifier;
9197                 }
9198                 else if (flagsp == &negflags) {
9199                     goto neg_modifier;
9200                 }
9201                 cs = REGEX_LOCALE_CHARSET;
9202                 has_charset_modifier = LOCALE_PAT_MOD;
9203                 break;
9204             case UNICODE_PAT_MOD:
9205                 if (has_charset_modifier) {
9206                     goto excess_modifier;
9207                 }
9208                 else if (flagsp == &negflags) {
9209                     goto neg_modifier;
9210                 }
9211                 cs = REGEX_UNICODE_CHARSET;
9212                 has_charset_modifier = UNICODE_PAT_MOD;
9213                 break;
9214             case ASCII_RESTRICT_PAT_MOD:
9215                 if (flagsp == &negflags) {
9216                     goto neg_modifier;
9217                 }
9218                 if (has_charset_modifier) {
9219                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9220                         goto excess_modifier;
9221                     }
9222                     /* Doubled modifier implies more restricted */
9223                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9224                 }
9225                 else {
9226                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9227                 }
9228                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9229                 break;
9230             case DEPENDS_PAT_MOD:
9231                 if (has_use_defaults) {
9232                     goto fail_modifiers;
9233                 }
9234                 else if (flagsp == &negflags) {
9235                     goto neg_modifier;
9236                 }
9237                 else if (has_charset_modifier) {
9238                     goto excess_modifier;
9239                 }
9240
9241                 /* The dual charset means unicode semantics if the
9242                  * pattern (or target, not known until runtime) are
9243                  * utf8, or something in the pattern indicates unicode
9244                  * semantics */
9245                 cs = (RExC_utf8 || RExC_uni_semantics)
9246                      ? REGEX_UNICODE_CHARSET
9247                      : REGEX_DEPENDS_CHARSET;
9248                 has_charset_modifier = DEPENDS_PAT_MOD;
9249                 break;
9250             excess_modifier:
9251                 RExC_parse++;
9252                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9253                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9254                 }
9255                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9256                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9257                                         *(RExC_parse - 1));
9258                 }
9259                 else {
9260                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9261                 }
9262                 /*NOTREACHED*/
9263             neg_modifier:
9264                 RExC_parse++;
9265                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9266                                     *(RExC_parse - 1));
9267                 /*NOTREACHED*/
9268             case ONCE_PAT_MOD: /* 'o' */
9269             case GLOBAL_PAT_MOD: /* 'g' */
9270                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9271                     const I32 wflagbit = *RExC_parse == 'o'
9272                                          ? WASTED_O
9273                                          : WASTED_G;
9274                     if (! (wastedflags & wflagbit) ) {
9275                         wastedflags |= wflagbit;
9276                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9277                         vWARN5(
9278                             RExC_parse + 1,
9279                             "Useless (%s%c) - %suse /%c modifier",
9280                             flagsp == &negflags ? "?-" : "?",
9281                             *RExC_parse,
9282                             flagsp == &negflags ? "don't " : "",
9283                             *RExC_parse
9284                         );
9285                     }
9286                 }
9287                 break;
9288
9289             case CONTINUE_PAT_MOD: /* 'c' */
9290                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9291                     if (! (wastedflags & WASTED_C) ) {
9292                         wastedflags |= WASTED_GC;
9293                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9294                         vWARN3(
9295                             RExC_parse + 1,
9296                             "Useless (%sc) - %suse /gc modifier",
9297                             flagsp == &negflags ? "?-" : "?",
9298                             flagsp == &negflags ? "don't " : ""
9299                         );
9300                     }
9301                 }
9302                 break;
9303             case KEEPCOPY_PAT_MOD: /* 'p' */
9304                 if (flagsp == &negflags) {
9305                     if (SIZE_ONLY)
9306                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9307                 } else {
9308                     *flagsp |= RXf_PMf_KEEPCOPY;
9309                 }
9310                 break;
9311             case '-':
9312                 /* A flag is a default iff it is following a minus, so
9313                  * if there is a minus, it means will be trying to
9314                  * re-specify a default which is an error */
9315                 if (has_use_defaults || flagsp == &negflags) {
9316                     goto fail_modifiers;
9317                 }
9318                 flagsp = &negflags;
9319                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9320                 break;
9321             case ':':
9322             case ')':
9323                 RExC_flags |= posflags;
9324                 RExC_flags &= ~negflags;
9325                 set_regex_charset(&RExC_flags, cs);
9326                 if (RExC_flags & RXf_PMf_FOLD) {
9327                     RExC_contains_i = 1;
9328                 }
9329                 return;
9330                 /*NOTREACHED*/
9331             default:
9332             fail_modifiers:
9333                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9334                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9335                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9336                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9337                 /*NOTREACHED*/
9338         }
9339
9340         ++RExC_parse;
9341     }
9342 }
9343
9344 /*
9345  - reg - regular expression, i.e. main body or parenthesized thing
9346  *
9347  * Caller must absorb opening parenthesis.
9348  *
9349  * Combining parenthesis handling with the base level of regular expression
9350  * is a trifle forced, but the need to tie the tails of the branches to what
9351  * follows makes it hard to avoid.
9352  */
9353 #define REGTAIL(p,val,q) regtail((p),(val),(q),depth+1) /* reads the caller's local 'depth' */
9354 #ifndef DEBUGGING /* without DEBUGGING, REGTAIL_STUDY is the same call as REGTAIL */
9355 #define REGTAIL_STUDY(p,val,q) regtail((p),(val),(q),depth+1)
9356 #else /* DEBUGGING: route through regtail_study() instead */
9357 #define REGTAIL_STUDY(p,val,q) regtail_study((p),(val),(q),depth+1)
9358 #endif /* DEBUGGING */
9359
9360 /* Returns NULL, setting *flagp to TRYAGAIN, at the end of a (?...) construct
9361    that only sets flags, or after a (?#...) comment.  Returns NULL, setting
9362    *flagp to RESTART_UTF8, if the sizing scan needs to be restarted.
9363    Otherwise it would return NULL only if regbranch() returned NULL, which
9364    cannot happen.  */
9365 STATIC regnode *
9366 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9367     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9368      * 2 is like 1, but indicates that nextchar() has been called to advance
9369      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9370      * this flag alerts us to the need to check for that */
9371 {
9372     dVAR;
9373     regnode *ret;               /* Will be the head of the group. */
9374     regnode *br;
9375     regnode *lastbr;
9376     regnode *ender = NULL;
9377     I32 parno = 0;
9378     I32 flags;
9379     U32 oregflags = RExC_flags;
9380     bool have_branch = 0;
9381     bool is_open = 0;
9382     I32 freeze_paren = 0;
9383     I32 after_freeze = 0;
9384
9385     char * parse_start = RExC_parse; /* MJD */
9386     char * const oregcomp_parse = RExC_parse;
9387
9388     GET_RE_DEBUG_FLAGS_DECL;
9389
9390     PERL_ARGS_ASSERT_REG;
9391     DEBUG_PARSE("reg ");
9392
9393     *flagp = 0;                         /* Tentatively. */
9394
9395
9396     /* Make an OPEN node, if parenthesized. */
9397     if (paren) {
9398
9399         /* Under /x, space and comments can be gobbled up between the '(' and
9400          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9401          * intervening space, as the sequence is a token, and a token should be
9402          * indivisible */
9403         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9404
9405         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9406             char *start_verb = RExC_parse;
9407             STRLEN verb_len = 0;
9408             char *start_arg = NULL;
9409             unsigned char op = 0;
9410             int argok = 1;
9411             int internal_argval = 0; /* internal_argval is only useful if
9412                                         !argok */
9413
9414             if (has_intervening_patws && SIZE_ONLY) {
9415                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9416             }
9417             while ( *RExC_parse && *RExC_parse != ')' ) {
9418                 if ( *RExC_parse == ':' ) {
9419                     start_arg = RExC_parse + 1;
9420                     break;
9421                 }
9422                 RExC_parse++;
9423             }
9424             ++start_verb;
9425             verb_len = RExC_parse - start_verb;
9426             if ( start_arg ) {
9427                 RExC_parse++;
9428                 while ( *RExC_parse && *RExC_parse != ')' )
9429                     RExC_parse++;
9430                 if ( *RExC_parse != ')' )
9431                     vFAIL("Unterminated verb pattern argument");
9432                 if ( RExC_parse == start_arg )
9433                     start_arg = NULL;
9434             } else {
9435                 if ( *RExC_parse != ')' )
9436                     vFAIL("Unterminated verb pattern");
9437             }
9438
9439             switch ( *start_verb ) {
9440             case 'A':  /* (*ACCEPT) */
9441                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9442                     op = ACCEPT;
9443                     internal_argval = RExC_nestroot;
9444                 }
9445                 break;
9446             case 'C':  /* (*COMMIT) */
9447                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9448                     op = COMMIT;
9449                 break;
9450             case 'F':  /* (*FAIL) */
9451                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9452                     op = OPFAIL;
9453                     argok = 0;
9454                 }
9455                 break;
9456             case ':':  /* (*:NAME) */
9457             case 'M':  /* (*MARK:NAME) */
9458                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9459                     op = MARKPOINT;
9460                     argok = -1;
9461                 }
9462                 break;
9463             case 'P':  /* (*PRUNE) */
9464                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9465                     op = PRUNE;
9466                 break;
9467             case 'S':   /* (*SKIP) */
9468                 if ( memEQs(start_verb,verb_len,"SKIP") )
9469                     op = SKIP;
9470                 break;
9471             case 'T':  /* (*THEN) */
9472                 /* [19:06] <TimToady> :: is then */
9473                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9474                     op = CUTGROUP;
9475                     RExC_seen |= REG_CUTGROUP_SEEN;
9476                 }
9477                 break;
9478             }
9479             if ( ! op ) {
9480                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9481                 vFAIL2utf8f(
9482                     "Unknown verb pattern '%"UTF8f"'",
9483                     UTF8fARG(UTF, verb_len, start_verb));
9484             }
9485             if ( argok ) {
9486                 if ( start_arg && internal_argval ) {
9487                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9488                         verb_len, start_verb);
9489                 } else if ( argok < 0 && !start_arg ) {
9490                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9491                         verb_len, start_verb);
9492                 } else {
9493                     ret = reganode(pRExC_state, op, internal_argval);
9494                     if ( ! internal_argval && ! SIZE_ONLY ) {
9495                         if (start_arg) {
9496                             SV *sv = newSVpvn( start_arg,
9497                                                RExC_parse - start_arg);
9498                             ARG(ret) = add_data( pRExC_state,
9499                                                  STR_WITH_LEN("S"));
9500                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9501                             ret->flags = 0;
9502                         } else {
9503                             ret->flags = 1;
9504                         }
9505                     }
9506                 }
9507                 if (!internal_argval)
9508                     RExC_seen |= REG_VERBARG_SEEN;
9509             } else if ( start_arg ) {
9510                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9511                         verb_len, start_verb);
9512             } else {
9513                 ret = reg_node(pRExC_state, op);
9514             }
9515             nextchar(pRExC_state);
9516             return ret;
9517         }
9518         else if (*RExC_parse == '?') { /* (?...) */
9519             bool is_logical = 0;
9520             const char * const seqstart = RExC_parse;
9521             if (has_intervening_patws && SIZE_ONLY) {
9522                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9523             }
9524
9525             RExC_parse++;
9526             paren = *RExC_parse++;
9527             ret = NULL;                 /* For look-ahead/behind. */
9528             switch (paren) {
9529
9530             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9531                 paren = *RExC_parse++;
9532                 if ( paren == '<')         /* (?P<...>) named capture */
9533                     goto named_capture;
9534                 else if (paren == '>') {   /* (?P>name) named recursion */
9535                     goto named_recursion;
9536                 }
9537                 else if (paren == '=') {   /* (?P=...)  named backref */
9538                     /* this pretty much dupes the code for \k<NAME> in
9539                      * regatom(), if you change this make sure you change that
9540                      * */
9541                     char* name_start = RExC_parse;
9542                     U32 num = 0;
9543                     SV *sv_dat = reg_scan_name(pRExC_state,
9544                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9545                     if (RExC_parse == name_start || *RExC_parse != ')')
9546                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9547                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9548
9549                     if (!SIZE_ONLY) {
9550                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9551                         RExC_rxi->data->data[num]=(void*)sv_dat;
9552                         SvREFCNT_inc_simple_void(sv_dat);
9553                     }
9554                     RExC_sawback = 1;
9555                     ret = reganode(pRExC_state,
9556                                    ((! FOLD)
9557                                      ? NREF
9558                                      : (ASCII_FOLD_RESTRICTED)
9559                                        ? NREFFA
9560                                        : (AT_LEAST_UNI_SEMANTICS)
9561                                          ? NREFFU
9562                                          : (LOC)
9563                                            ? NREFFL
9564                                            : NREFF),
9565                                     num);
9566                     *flagp |= HASWIDTH;
9567
9568                     Set_Node_Offset(ret, parse_start+1);
9569                     Set_Node_Cur_Length(ret, parse_start);
9570
9571                     nextchar(pRExC_state);
9572                     return ret;
9573                 }
9574                 RExC_parse++;
9575                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9576                 vFAIL3("Sequence (%.*s...) not recognized",
9577                                 RExC_parse-seqstart, seqstart);
9578                 /*NOTREACHED*/
9579             case '<':           /* (?<...) */
9580                 if (*RExC_parse == '!')
9581                     paren = ',';
9582                 else if (*RExC_parse != '=')
9583               named_capture:
9584                 {               /* (?<...>) */
9585                     char *name_start;
9586                     SV *svname;
9587                     paren= '>';
9588             case '\'':          /* (?'...') */
9589                     name_start= RExC_parse;
9590                     svname = reg_scan_name(pRExC_state,
9591                         SIZE_ONLY    /* reverse test from the others */
9592                         ? REG_RSN_RETURN_NAME
9593                         : REG_RSN_RETURN_NULL);
9594                     if (RExC_parse == name_start || *RExC_parse != paren)
9595                         vFAIL2("Sequence (?%c... not terminated",
9596                             paren=='>' ? '<' : paren);
9597                     if (SIZE_ONLY) {
9598                         HE *he_str;
9599                         SV *sv_dat = NULL;
9600                         if (!svname) /* shouldn't happen */
9601                             Perl_croak(aTHX_
9602                                 "panic: reg_scan_name returned NULL");
9603                         if (!RExC_paren_names) {
9604                             RExC_paren_names= newHV();
9605                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9606 #ifdef DEBUGGING
9607                             RExC_paren_name_list= newAV();
9608                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9609 #endif
9610                         }
9611                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9612                         if ( he_str )
9613                             sv_dat = HeVAL(he_str);
9614                         if ( ! sv_dat ) {
9615                             /* croak baby croak */
9616                             Perl_croak(aTHX_
9617                                 "panic: paren_name hash element allocation failed");
9618                         } else if ( SvPOK(sv_dat) ) {
9619                             /* (?|...) can mean we have dupes so scan to check
9620                                its already been stored. Maybe a flag indicating
9621                                we are inside such a construct would be useful,
9622                                but the arrays are likely to be quite small, so
9623                                for now we punt -- dmq */
9624                             IV count = SvIV(sv_dat);
9625                             I32 *pv = (I32*)SvPVX(sv_dat);
9626                             IV i;
9627                             for ( i = 0 ; i < count ; i++ ) {
9628                                 if ( pv[i] == RExC_npar ) {
9629                                     count = 0;
9630                                     break;
9631                                 }
9632                             }
9633                             if ( count ) {
9634                                 pv = (I32*)SvGROW(sv_dat,
9635                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9636                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9637                                 pv[count] = RExC_npar;
9638                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9639                             }
9640                         } else {
9641                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9642                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9643                                                                 sizeof(I32));
9644                             SvIOK_on(sv_dat);
9645                             SvIV_set(sv_dat, 1);
9646                         }
9647 #ifdef DEBUGGING
9648                         /* Yes this does cause a memory leak in debugging Perls
9649                          * */
9650                         if (!av_store(RExC_paren_name_list,
9651                                       RExC_npar, SvREFCNT_inc(svname)))
9652                             SvREFCNT_dec_NN(svname);
9653 #endif
9654
9655                         /*sv_dump(sv_dat);*/
9656                     }
9657                     nextchar(pRExC_state);
9658                     paren = 1;
9659                     goto capturing_parens;
9660                 }
9661                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9662                 RExC_in_lookbehind++;
9663                 RExC_parse++;
9664             case '=':           /* (?=...) */
9665                 RExC_seen_zerolen++;
9666                 break;
9667             case '!':           /* (?!...) */
9668                 RExC_seen_zerolen++;
9669                 if (*RExC_parse == ')') {
9670                     ret=reg_node(pRExC_state, OPFAIL);
9671                     nextchar(pRExC_state);
9672                     return ret;
9673                 }
9674                 break;
9675             case '|':           /* (?|...) */
9676                 /* branch reset, behave like a (?:...) except that
9677                    buffers in alternations share the same numbers */
9678                 paren = ':';
9679                 after_freeze = freeze_paren = RExC_npar;
9680                 break;
9681             case ':':           /* (?:...) */
9682             case '>':           /* (?>...) */
9683                 break;
9684             case '$':           /* (?$...) */
9685             case '@':           /* (?@...) */
9686                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9687                 break;
9688             case '#':           /* (?#...) */
9689                 /* XXX As soon as we disallow separating the '?' and '*' (by
9690                  * spaces or (?#...) comment), it is believed that this case
9691                  * will be unreachable and can be removed.  See
9692                  * [perl #117327] */
9693                 while (*RExC_parse && *RExC_parse != ')')
9694                     RExC_parse++;
9695                 if (*RExC_parse != ')')
9696                     FAIL("Sequence (?#... not terminated");
9697                 nextchar(pRExC_state);
9698                 *flagp = TRYAGAIN;
9699                 return NULL;
9700             case '0' :           /* (?0) */
9701             case 'R' :           /* (?R) */
9702                 if (*RExC_parse != ')')
9703                     FAIL("Sequence (?R) not terminated");
9704                 ret = reg_node(pRExC_state, GOSTART);
9705                     RExC_seen |= REG_GOSTART_SEEN;
9706                 *flagp |= POSTPONED;
9707                 nextchar(pRExC_state);
9708                 return ret;
9709                 /*notreached*/
9710             { /* named and numeric backreferences */
9711                 I32 num;
9712             case '&':            /* (?&NAME) */
9713                 parse_start = RExC_parse - 1;
9714               named_recursion:
9715                 {
9716                     SV *sv_dat = reg_scan_name(pRExC_state,
9717                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9718                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9719                 }
9720                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9721                     vFAIL("Sequence (?&... not terminated");
9722                 goto gen_recurse_regop;
9723                 assert(0); /* NOT REACHED */
9724             case '+':
9725                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9726                     RExC_parse++;
9727                     vFAIL("Illegal pattern");
9728                 }
9729                 goto parse_recursion;
9730                 /* NOT REACHED*/
9731             case '-': /* (?-1) */
9732                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9733                     RExC_parse--; /* rewind to let it be handled later */
9734                     goto parse_flags;
9735                 }
9736                 /*FALLTHROUGH */
9737             case '1': case '2': case '3': case '4': /* (?1) */
9738             case '5': case '6': case '7': case '8': case '9':
9739                 RExC_parse--;
9740               parse_recursion:
9741                 num = atoi(RExC_parse);
9742                 parse_start = RExC_parse - 1; /* MJD */
9743                 if (*RExC_parse == '-')
9744                     RExC_parse++;
9745                 while (isDIGIT(*RExC_parse))
9746                         RExC_parse++;
9747                 if (*RExC_parse!=')')
9748                     vFAIL("Expecting close bracket");
9749
9750               gen_recurse_regop:
9751                 if ( paren == '-' ) {
9752                     /*
9753                     Diagram of capture buffer numbering.
9754                     Top line is the normal capture buffer numbers
9755                     Bottom line is the negative indexing as from
9756                     the X (the (?-2))
9757
9758                     +   1 2    3 4 5 X          6 7
9759                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9760                     -   5 4    3 2 1 X          x x
9761
9762                     */
9763                     num = RExC_npar + num;
9764                     if (num < 1)  {
9765                         RExC_parse++;
9766                         vFAIL("Reference to nonexistent group");
9767                     }
9768                 } else if ( paren == '+' ) {
9769                     num = RExC_npar + num - 1;
9770                 }
9771
9772                 ret = reganode(pRExC_state, GOSUB, num);
9773                 if (!SIZE_ONLY) {
9774                     if (num > (I32)RExC_rx->nparens) {
9775                         RExC_parse++;
9776                         vFAIL("Reference to nonexistent group");
9777                     }
9778                     ARG2L_SET( ret, RExC_recurse_count++);
9779                     RExC_emit++;
9780                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9781                         "Recurse #%"UVuf" to %"IVdf"\n",
9782                               (UV)ARG(ret), (IV)ARG2L(ret)));
9783                 } else {
9784                     RExC_size++;
9785                 }
9786                     RExC_seen |= REG_RECURSE_SEEN;
9787                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9788                 Set_Node_Offset(ret, parse_start); /* MJD */
9789
9790                 *flagp |= POSTPONED;
9791                 nextchar(pRExC_state);
9792                 return ret;
9793             } /* named and numeric backreferences */
9794             assert(0); /* NOT REACHED */
9795
9796             case '?':           /* (??...) */
9797                 is_logical = 1;
9798                 if (*RExC_parse != '{') {
9799                     RExC_parse++;
9800                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9801                     vFAIL2utf8f(
9802                         "Sequence (%"UTF8f"...) not recognized",
9803                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9804                     /*NOTREACHED*/
9805                 }
9806                 *flagp |= POSTPONED;
9807                 paren = *RExC_parse++;
9808                 /* FALL THROUGH */
9809             case '{':           /* (?{...}) */
9810             {
9811                 U32 n = 0;
9812                 struct reg_code_block *cb;
9813
9814                 RExC_seen_zerolen++;
9815
9816                 if (   !pRExC_state->num_code_blocks
9817                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9818                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9819                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9820                             - RExC_start)
9821                 ) {
9822                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9823                         FAIL("panic: Sequence (?{...}): no code block found\n");
9824                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9825                 }
9826                 /* this is a pre-compiled code block (?{...}) */
9827                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9828                 RExC_parse = RExC_start + cb->end;
9829                 if (!SIZE_ONLY) {
9830                     OP *o = cb->block;
9831                     if (cb->src_regex) {
9832                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9833                         RExC_rxi->data->data[n] =
9834                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9835                         RExC_rxi->data->data[n+1] = (void*)o;
9836                     }
9837                     else {
9838                         n = add_data(pRExC_state,
9839                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9840                         RExC_rxi->data->data[n] = (void*)o;
9841                     }
9842                 }
9843                 pRExC_state->code_index++;
9844                 nextchar(pRExC_state);
9845
9846                 if (is_logical) {
9847                     regnode *eval;
9848                     ret = reg_node(pRExC_state, LOGICAL);
9849                     eval = reganode(pRExC_state, EVAL, n);
9850                     if (!SIZE_ONLY) {
9851                         ret->flags = 2;
9852                         /* for later propagation into (??{}) return value */
9853                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9854                     }
9855                     REGTAIL(pRExC_state, ret, eval);
9856                     /* deal with the length of this later - MJD */
9857                     return ret;
9858                 }
9859                 ret = reganode(pRExC_state, EVAL, n);
9860                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9861                 Set_Node_Offset(ret, parse_start);
9862                 return ret;
9863             }
9864             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9865             {
9866                 int is_define= 0;
9867                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9868                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9869                         || RExC_parse[1] == '<'
9870                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9871                         I32 flag;
9872                         regnode *tail;
9873
9874                         ret = reg_node(pRExC_state, LOGICAL);
9875                         if (!SIZE_ONLY)
9876                             ret->flags = 1;
9877
9878                         tail = reg(pRExC_state, 1, &flag, depth+1);
9879                         if (flag & RESTART_UTF8) {
9880                             *flagp = RESTART_UTF8;
9881                             return NULL;
9882                         }
9883                         REGTAIL(pRExC_state, ret, tail);
9884                         goto insert_if;
9885                     }
9886                 }
9887                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9888                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9889                 {
9890                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9891                     char *name_start= RExC_parse++;
9892                     U32 num = 0;
9893                     SV *sv_dat=reg_scan_name(pRExC_state,
9894                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9895                     if (RExC_parse == name_start || *RExC_parse != ch)
9896                         vFAIL2("Sequence (?(%c... not terminated",
9897                             (ch == '>' ? '<' : ch));
9898                     RExC_parse++;
9899                     if (!SIZE_ONLY) {
9900                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9901                         RExC_rxi->data->data[num]=(void*)sv_dat;
9902                         SvREFCNT_inc_simple_void(sv_dat);
9903                     }
9904                     ret = reganode(pRExC_state,NGROUPP,num);
9905                     goto insert_if_check_paren;
9906                 }
9907                 else if (RExC_parse[0] == 'D' &&
9908                          RExC_parse[1] == 'E' &&
9909                          RExC_parse[2] == 'F' &&
9910                          RExC_parse[3] == 'I' &&
9911                          RExC_parse[4] == 'N' &&
9912                          RExC_parse[5] == 'E')
9913                 {
9914                     ret = reganode(pRExC_state,DEFINEP,0);
9915                     RExC_parse +=6 ;
9916                     is_define = 1;
9917                     goto insert_if_check_paren;
9918                 }
9919                 else if (RExC_parse[0] == 'R') {
9920                     RExC_parse++;
9921                     parno = 0;
9922                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9923                         parno = atoi(RExC_parse++);
9924                         while (isDIGIT(*RExC_parse))
9925                             RExC_parse++;
9926                     } else if (RExC_parse[0] == '&') {
9927                         SV *sv_dat;
9928                         RExC_parse++;
9929                         sv_dat = reg_scan_name(pRExC_state,
9930                             SIZE_ONLY
9931                             ? REG_RSN_RETURN_NULL
9932                             : REG_RSN_RETURN_DATA);
9933                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9934                     }
9935                     ret = reganode(pRExC_state,INSUBP,parno);
9936                     goto insert_if_check_paren;
9937                 }
9938                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9939                     /* (?(1)...) */
9940                     char c;
9941                     char *tmp;
9942                     parno = atoi(RExC_parse++);
9943
9944                     while (isDIGIT(*RExC_parse))
9945                         RExC_parse++;
9946                     ret = reganode(pRExC_state, GROUPP, parno);
9947
9948                  insert_if_check_paren:
9949                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9950                         /* nextchar also skips comments, so undo its work
9951                          * and skip over the next character.
9952                          */
9953                         RExC_parse = tmp;
9954                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9955                         vFAIL("Switch condition not recognized");
9956                     }
9957                   insert_if:
9958                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9959                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9960                     if (br == NULL) {
9961                         if (flags & RESTART_UTF8) {
9962                             *flagp = RESTART_UTF8;
9963                             return NULL;
9964                         }
9965                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9966                               (UV) flags);
9967                     } else
9968                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9969                                                           LONGJMP, 0));
9970                     c = *nextchar(pRExC_state);
9971                     if (flags&HASWIDTH)
9972                         *flagp |= HASWIDTH;
9973                     if (c == '|') {
9974                         if (is_define)
9975                             vFAIL("(?(DEFINE)....) does not allow branches");
9976
9977                         /* Fake one for optimizer.  */
9978                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9979
9980                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9981                             if (flags & RESTART_UTF8) {
9982                                 *flagp = RESTART_UTF8;
9983                                 return NULL;
9984                             }
9985                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9986                                   (UV) flags);
9987                         }
9988                         REGTAIL(pRExC_state, ret, lastbr);
9989                         if (flags&HASWIDTH)
9990                             *flagp |= HASWIDTH;
9991                         c = *nextchar(pRExC_state);
9992                     }
9993                     else
9994                         lastbr = NULL;
9995                     if (c != ')')
9996                         vFAIL("Switch (?(condition)... contains too many branches");
9997                     ender = reg_node(pRExC_state, TAIL);
9998                     REGTAIL(pRExC_state, br, ender);
9999                     if (lastbr) {
10000                         REGTAIL(pRExC_state, lastbr, ender);
10001                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10002                     }
10003                     else
10004                         REGTAIL(pRExC_state, ret, ender);
10005                     RExC_size++; /* XXX WHY do we need this?!!
10006                                     For large programs it seems to be required
10007                                     but I can't figure out why. -- dmq*/
10008                     return ret;
10009                 }
10010                 else {
10011                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10012                     vFAIL("Unknown switch condition (?(...))");
10013                 }
10014             }
10015             case '[':           /* (?[ ... ]) */
10016                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10017                                          oregcomp_parse);
10018             case 0:
10019                 RExC_parse--; /* for vFAIL to print correctly */
10020                 vFAIL("Sequence (? incomplete");
10021                 break;
10022             default: /* e.g., (?i) */
10023                 --RExC_parse;
10024               parse_flags:
10025                 parse_lparen_question_flags(pRExC_state);
10026                 if (UCHARAT(RExC_parse) != ':') {
10027                     nextchar(pRExC_state);
10028                     *flagp = TRYAGAIN;
10029                     return NULL;
10030                 }
10031                 paren = ':';
10032                 nextchar(pRExC_state);
10033                 ret = NULL;
10034                 goto parse_rest;
10035             } /* end switch */
10036         }
10037         else {                  /* (...) */
10038           capturing_parens:
10039             parno = RExC_npar;
10040             RExC_npar++;
10041
10042             ret = reganode(pRExC_state, OPEN, parno);
10043             if (!SIZE_ONLY ){
10044                 if (!RExC_nestroot)
10045                     RExC_nestroot = parno;
10046                 if (RExC_seen & REG_RECURSE_SEEN
10047                     && !RExC_open_parens[parno-1])
10048                 {
10049                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10050                         "Setting open paren #%"IVdf" to %d\n",
10051                         (IV)parno, REG_NODE_NUM(ret)));
10052                     RExC_open_parens[parno-1]= ret;
10053                 }
10054             }
10055             Set_Node_Length(ret, 1); /* MJD */
10056             Set_Node_Offset(ret, RExC_parse); /* MJD */
10057             is_open = 1;
10058         }
10059     }
10060     else                        /* ! paren */
10061         ret = NULL;
10062
10063    parse_rest:
10064     /* Pick up the branches, linking them together. */
10065     parse_start = RExC_parse;   /* MJD */
10066     br = regbranch(pRExC_state, &flags, 1,depth+1);
10067
10068     /*     branch_len = (paren != 0); */
10069
10070     if (br == NULL) {
10071         if (flags & RESTART_UTF8) {
10072             *flagp = RESTART_UTF8;
10073             return NULL;
10074         }
10075         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10076     }
10077     if (*RExC_parse == '|') {
10078         if (!SIZE_ONLY && RExC_extralen) {
10079             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10080         }
10081         else {                  /* MJD */
10082             reginsert(pRExC_state, BRANCH, br, depth+1);
10083             Set_Node_Length(br, paren != 0);
10084             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10085         }
10086         have_branch = 1;
10087         if (SIZE_ONLY)
10088             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10089     }
10090     else if (paren == ':') {
10091         *flagp |= flags&SIMPLE;
10092     }
10093     if (is_open) {                              /* Starts with OPEN. */
10094         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10095     }
10096     else if (paren != '?')              /* Not Conditional */
10097         ret = br;
10098     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10099     lastbr = br;
10100     while (*RExC_parse == '|') {
10101         if (!SIZE_ONLY && RExC_extralen) {
10102             ender = reganode(pRExC_state, LONGJMP,0);
10103
10104             /* Append to the previous. */
10105             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10106         }
10107         if (SIZE_ONLY)
10108             RExC_extralen += 2;         /* Account for LONGJMP. */
10109         nextchar(pRExC_state);
10110         if (freeze_paren) {
10111             if (RExC_npar > after_freeze)
10112                 after_freeze = RExC_npar;
10113             RExC_npar = freeze_paren;
10114         }
10115         br = regbranch(pRExC_state, &flags, 0, depth+1);
10116
10117         if (br == NULL) {
10118             if (flags & RESTART_UTF8) {
10119                 *flagp = RESTART_UTF8;
10120                 return NULL;
10121             }
10122             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10123         }
10124         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10125         lastbr = br;
10126         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10127     }
10128
10129     if (have_branch || paren != ':') {
10130         /* Make a closing node, and hook it on the end. */
10131         switch (paren) {
10132         case ':':
10133             ender = reg_node(pRExC_state, TAIL);
10134             break;
10135         case 1: case 2:
10136             ender = reganode(pRExC_state, CLOSE, parno);
10137             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10138                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10139                         "Setting close paren #%"IVdf" to %d\n",
10140                         (IV)parno, REG_NODE_NUM(ender)));
10141                 RExC_close_parens[parno-1]= ender;
10142                 if (RExC_nestroot == parno)
10143                     RExC_nestroot = 0;
10144             }
10145             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10146             Set_Node_Length(ender,1); /* MJD */
10147             break;
10148         case '<':
10149         case ',':
10150         case '=':
10151         case '!':
10152             *flagp &= ~HASWIDTH;
10153             /* FALL THROUGH */
10154         case '>':
10155             ender = reg_node(pRExC_state, SUCCEED);
10156             break;
10157         case 0:
10158             ender = reg_node(pRExC_state, END);
10159             if (!SIZE_ONLY) {
10160                 assert(!RExC_opend); /* there can only be one! */
10161                 RExC_opend = ender;
10162             }
10163             break;
10164         }
10165         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10166             SV * const mysv_val1=sv_newmortal();
10167             SV * const mysv_val2=sv_newmortal();
10168             DEBUG_PARSE_MSG("lsbr");
10169             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10170             regprop(RExC_rx, mysv_val2, ender, NULL);
10171             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10172                           SvPV_nolen_const(mysv_val1),
10173                           (IV)REG_NODE_NUM(lastbr),
10174                           SvPV_nolen_const(mysv_val2),
10175                           (IV)REG_NODE_NUM(ender),
10176                           (IV)(ender - lastbr)
10177             );
10178         });
10179         REGTAIL(pRExC_state, lastbr, ender);
10180
10181         if (have_branch && !SIZE_ONLY) {
10182             char is_nothing= 1;
10183             if (depth==1)
10184                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10185
10186             /* Hook the tails of the branches to the closing node. */
10187             for (br = ret; br; br = regnext(br)) {
10188                 const U8 op = PL_regkind[OP(br)];
10189                 if (op == BRANCH) {
10190                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10191                     if ( OP(NEXTOPER(br)) != NOTHING
10192                          || regnext(NEXTOPER(br)) != ender)
10193                         is_nothing= 0;
10194                 }
10195                 else if (op == BRANCHJ) {
10196                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10197                     /* for now we always disable this optimisation * /
10198                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10199                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10200                     */
10201                         is_nothing= 0;
10202                 }
10203             }
10204             if (is_nothing) {
10205                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10206                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10207                     SV * const mysv_val1=sv_newmortal();
10208                     SV * const mysv_val2=sv_newmortal();
10209                     DEBUG_PARSE_MSG("NADA");
10210                     regprop(RExC_rx, mysv_val1, ret, NULL);
10211                     regprop(RExC_rx, mysv_val2, ender, NULL);
10212                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10213                                   SvPV_nolen_const(mysv_val1),
10214                                   (IV)REG_NODE_NUM(ret),
10215                                   SvPV_nolen_const(mysv_val2),
10216                                   (IV)REG_NODE_NUM(ender),
10217                                   (IV)(ender - ret)
10218                     );
10219                 });
10220                 OP(br)= NOTHING;
10221                 if (OP(ender) == TAIL) {
10222                     NEXT_OFF(br)= 0;
10223                     RExC_emit= br + 1;
10224                 } else {
10225                     regnode *opt;
10226                     for ( opt= br + 1; opt < ender ; opt++ )
10227                         OP(opt)= OPTIMIZED;
10228                     NEXT_OFF(br)= ender - br;
10229                 }
10230             }
10231         }
10232     }
10233
10234     {
10235         const char *p;
10236         static const char parens[] = "=!<,>";
10237
10238         if (paren && (p = strchr(parens, paren))) {
10239             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10240             int flag = (p - parens) > 1;
10241
10242             if (paren == '>')
10243                 node = SUSPEND, flag = 0;
10244             reginsert(pRExC_state, node,ret, depth+1);
10245             Set_Node_Cur_Length(ret, parse_start);
10246             Set_Node_Offset(ret, parse_start + 1);
10247             ret->flags = flag;
10248             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10249         }
10250     }
10251
10252     /* Check for proper termination. */
10253     if (paren) {
10254         /* restore original flags, but keep (?p) */
10255         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10256         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10257             RExC_parse = oregcomp_parse;
10258             vFAIL("Unmatched (");
10259         }
10260     }
10261     else if (!paren && RExC_parse < RExC_end) {
10262         if (*RExC_parse == ')') {
10263             RExC_parse++;
10264             vFAIL("Unmatched )");
10265         }
10266         else
10267             FAIL("Junk on end of regexp");      /* "Can't happen". */
10268         assert(0); /* NOTREACHED */
10269     }
10270
10271     if (RExC_in_lookbehind) {
10272         RExC_in_lookbehind--;
10273     }
10274     if (after_freeze > RExC_npar)
10275         RExC_npar = after_freeze;
10276     return(ret);
10277 }
10278
10279 /*
10280  - regbranch - one alternative of an | operator
10281  *
10282  * Implements the concatenation operator.
       *
       * Calls regpiece() repeatedly until the end of the pattern, a '|' or a ')'
       * is reached, linking each piece onto the previous one with REGTAIL.  If
       * no piece at all was produced, a single NOTHING node stands in for the
       * alternative.
       *
       *   pRExC_state  compilation state
       *   flagp        out: set to WORST on entry, then accumulates HASWIDTH and
       *                POSTPONED from every piece, SPSTART from the first piece
       *                only, and SIMPLE when exactly one piece was parsed
       *   first        non-zero: no leading node is emitted here;
       *                zero: the alternative starts with a BRANCH or BRANCHJ node
       *   depth        passed on to regpiece() as depth+1
       *
       * Returns the first node emitted for the alternative: the BRANCH/BRANCHJ
       * node when <first> is zero, otherwise the first piece (or the NOTHING
       * node when there were no pieces).
10283  *
10284  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10285  * restarted.
10286  */
10287 STATIC regnode *
10288 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10289 {
10290     dVAR;
10291     regnode *ret;
10292     regnode *chain = NULL;
          /* chain: most recently linked piece; NULL until one has been parsed */
10293     regnode *latest;
          /* flags: out-flags of the latest regpiece() call; c: pieces parsed */
10294     I32 flags = 0, c = 0;
10295     GET_RE_DEBUG_FLAGS_DECL;
10296
10297     PERL_ARGS_ASSERT_REGBRANCH;
10298
10299     DEBUG_PARSE("brnc");
10300
          /* For the first alternative no leading node is emitted; ret is filled
           * in by the first piece.  Later alternatives open with BRANCHJ when
           * emitting code and RExC_extralen is non-zero, otherwise with BRANCH. */
10301     if (first)
10302         ret = NULL;
10303     else {
10304         if (!SIZE_ONLY && RExC_extralen)
10305             ret = reganode(pRExC_state, BRANCHJ,0);
10306         else {
10307             ret = reg_node(pRExC_state, BRANCH);
10308             Set_Node_Length(ret, 1);
10309         }
10310     }
10311
          /* Sizing pass only: count one extra unit for each non-first branch. */
10312     if (!first && SIZE_ONLY)
10313         RExC_extralen += 1;                     /* BRANCHJ */
10314
10315     *flagp = WORST;                     /* Tentatively. */
10316
          /* Step back one character and let nextchar() move forward again.
           * NOTE(review): presumably this is done so that nextchar() skips any
           * comments/ignorable text at the start of the alternative -- confirm
           * against nextchar(). */
10317     RExC_parse--;
10318     nextchar(pRExC_state);
10319     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
              /* Clear a TRYAGAIN left over from the previous iteration. */
10320         flags &= ~TRYAGAIN;
10321         latest = regpiece(pRExC_state, &flags,depth+1);
10322         if (latest == NULL) {
                  /* TRYAGAIN: nothing was produced; loop again without counting
                   * a piece. */
10323             if (flags & TRYAGAIN)
10324                 continue;
10325             if (flags & RESTART_UTF8) {
10326                 *flagp = RESTART_UTF8;
10327                 return NULL;
10328             }
10329             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10330         }
10331         else if (ret == NULL)
10332             ret = latest;
10333         *flagp |= flags&(HASWIDTH|POSTPONED);
10334         if (chain == NULL)      /* First piece. */
10335             *flagp |= flags&SPSTART;
10336         else {
                  /* Each additional piece bumps the naughtiness count and is
                   * hooked onto the tail of the previous piece. */
10337             RExC_naughty++;
10338             REGTAIL(pRExC_state, chain, latest);
10339         }
10340         chain = latest;
10341         c++;
10342     }
          /* chain is still NULL if the loop body never ran, or if every
           * regpiece() call returned NULL with TRYAGAIN. */
10343     if (chain == NULL) {        /* Loop ran zero times. */
10344         chain = reg_node(pRExC_state, NOTHING);
10345         if (ret == NULL)
10346             ret = chain;
10347     }
          /* Only an alternative made of exactly one piece can report SIMPLE. */
10348     if (c == 1) {
10349         *flagp |= flags&SIMPLE;
10350     }
10351
10352     return ret;
10353 }
10354
10355 /*
10356  - regpiece - something followed by possible [*+?]
10357  *
10358  * Note that the branching code sequences used for ? and the general cases
10359  * of * and + are somewhat optimized:  they use the same NOTHING node as
10360  * both the endmarker for their branch list and the body of the last branch.
10361  * It might seem that this node could be dispensed with entirely, but the
10362  * endmarker role is not redundant.
10363  *
       * Parses one atom with regatom() and then any quantifier following it:
       * '*', '+', '?', or a '{min,max}' form accepted by regcurly().  A simple
       * atom gets a STAR, PLUS or CURLY node inserted in front of it; anything
       * else is wrapped in CURLYX ... WHILEM.  A '?' after the quantifier
       * inserts MINMOD; a '+' after the quantifier wraps the result in
       * SUSPEND ... SUCCEED (the possessive form).  A further quantifier
       * character after that is rejected as "Nested quantifiers".
       *
       *   pRExC_state  compilation state
       *   flagp        out: the atom's flags when no quantifier follows,
       *                otherwise flags describing the quantified piece
       *   depth        passed on as depth+1 to regatom() and reginsert()
       *
       * Returns the first node of the (possibly quantified) piece.
       *
10364  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10365  * TRYAGAIN.
10366  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10367  * restarted.
       * (In both NULL cases the bit is OR-ed into whatever *flagp already held.)
10368  */
10369 STATIC regnode *
10370 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10371 {
10372     dVAR;
10373     regnode *ret;
10374     char op;
10375     char *next;
10376     I32 flags;
          /* Start of this piece in the pattern; used for the warning text at
           * nest_check. */
10377     const char * const origparse = RExC_parse;
10378     I32 min;
          /* The '*' and '+' paths below set only min before jumping to do_curly
           * and rely on this default for max. */
10379     I32 max = REG_INFTY;
10380 #ifdef RE_TRACK_PATTERN_OFFSETS
10381     char *parse_start;
10382 #endif
10383     const char *maxpos = NULL;
10384
10385     /* Save the original in case we change the emitted regop to a FAIL. */
10386     regnode * const orig_emit = RExC_emit;
10387
10388     GET_RE_DEBUG_FLAGS_DECL;
10389
10390     PERL_ARGS_ASSERT_REGPIECE;
10391
10392     DEBUG_PARSE("piec");
10393
10394     ret = regatom(pRExC_state, &flags,depth+1);
10395     if (ret == NULL) {
              /* Hand TRYAGAIN / RESTART_UTF8 up to the caller; any other NULL
               * return from regatom() is treated as an internal error. */
10396         if (flags & (TRYAGAIN|RESTART_UTF8))
10397             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10398         else
10399             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10400         return(NULL);
10401     }
10402
10403     op = *RExC_parse;
10404
          /* A '{' is only handled as a quantifier when regcurly() accepts the
           * text starting at it. */
10405     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10406         maxpos = NULL;
10407 #ifdef RE_TRACK_PATTERN_OFFSETS
10408         parse_start = RExC_parse; /* MJD */
10409 #endif
              /* Scan over digits and at most one ','.  maxpos remembers where
               * that comma is; a second comma ends the scan. */
10410         next = RExC_parse + 1;
10411         while (isDIGIT(*next) || *next == ',') {
10412             if (*next == ',') {
10413                 if (maxpos)
10414                     break;
10415                 else
10416                     maxpos = next;
10417             }
10418             next++;
10419         }
10420         if (*next == '}') {             /* got one */
10421             if (!maxpos)
10422                 maxpos = next;
10423             RExC_parse++;
10424             min = atoi(RExC_parse);
                  /* With a comma, max is read from just after it.  Without one,
                   * maxpos is pointed back at the minimum's digits so that max
                   * comes out equal to min. */
10425             if (*maxpos == ',')
10426                 maxpos++;
10427             else
10428                 maxpos = RExC_parse;
10429             max = atoi(maxpos);
                  /* atoi() gave 0 but the text is not a literal '0': no maximum
                   * was written (as in "{n,}"), so the quantifier is open-ended. */
10430             if (!max && *maxpos != '0')
10431                 max = REG_INFTY;                /* meaning "infinity" */
10432             else if (max >= REG_INFTY)
10433                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10434             RExC_parse = next;
10435             nextchar(pRExC_state);
10436             if (max < min) {    /* If can't match, warn and optimize to fail
10437                                    unconditionally */
10438                 if (SIZE_ONLY) {
10439                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10440
10441                     /* We can't back off the size because we have to reserve
10442                      * enough space for all the things we are about to throw
10443                      * away, but we can shrink it by the amount we are about
10444                      * to re-use here */
10445                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10446                 }
10447                 else {
                          /* Emit pass: discard everything emitted for the atom
                           * by rewinding to where this piece started. */
10448                     RExC_emit = orig_emit;
10449                 }
                      /* NOTE(review): *flagp is not written anywhere on this
                       * return path. */
10450                 ret = reg_node(pRExC_state, OPFAIL);
10451                 return ret;
10452             }
10453             else if (min == max
10454                      && RExC_parse < RExC_end
10455                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10456             {
10457                 if (SIZE_ONLY) {
10458                     ckWARN2reg(RExC_parse + 1,
10459                                "Useless use of greediness modifier '%c'",
10460                                *RExC_parse);
10461                 }
10462                 /* Absorb the modifier, so later code doesn't see nor use
10463                     * it */
10464                 nextchar(pRExC_state);
10465             }
10466
                  /* Reached by falling through from the {min,max} parse above,
                   * and by goto from the '*', '+' and '?' handling further down
                   * (the label sits inside both enclosing if-blocks). */
10467         do_curly:
10468             if ((flags&SIMPLE)) {
                      /* Simple atom: a CURLY node is inserted in front of it. */
10469                 RExC_naughty += 2 + RExC_naughty / 2;
10470                 reginsert(pRExC_state, CURLY, ret, depth+1);
10471                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10472                 Set_Node_Cur_Length(ret, parse_start);
10473             }
10474             else {
                      /* General case: WHILEM is appended after the atom, CURLYX
                       * is inserted in front of it, and a NOTHING node is hung on
                       * the end.  The order of the reginsert() calls matters
                       * because each one is given the same start node <ret>. */
10475                 regnode * const w = reg_node(pRExC_state, WHILEM);
10476
10477                 w->flags = 0;
10478                 REGTAIL(pRExC_state, ret, w);
10479                 if (!SIZE_ONLY && RExC_extralen) {
10480                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10481                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10482                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10483                 }
10484                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10485                                 /* MJD hk */
10486                 Set_Node_Offset(ret, parse_start+1);
10487                 Set_Node_Length(ret,
10488                                 op == '{' ? (RExC_parse - parse_start) : 1);
10489
10490                 if (!SIZE_ONLY && RExC_extralen)
10491                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10492                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
                      /* Sizing pass: count this WHILEM and reserve extra room. */
10493                 if (SIZE_ONLY)
10494                     RExC_whilem_seen++, RExC_extralen += 3;
10495                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10496             }
10497             ret->flags = 0;
10498
                  /* NOTE(review): when this point is reached from the {min,max}
                   * parse with min == 0, *flagp has not been assigned in this
                   * function before the |= below. */
10499             if (min > 0)
10500                 *flagp = WORST;
10501             if (max > 0)
10502                 *flagp |= HASWIDTH;
                  /* Emit pass: store the bounds in the node, truncated to U16. */
10503             if (!SIZE_ONLY) {
10504                 ARG1_SET(ret, (U16)min);
10505                 ARG2_SET(ret, (U16)max);
10506             }
10507             if (max == REG_INFTY)
10508                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10509
10510             goto nest_check;
10511         }
10512     }
10513
          /* No quantifier follows: return the bare atom with its own flags. */
10514     if (!ISMULT1(op)) {
10515         *flagp = flags;
10516         return(ret);
10517     }
10518
10519 #if 0                           /* Now runtime fix should be reliable. */
10520
10521     /* if this is reinstated, don't forget to put this back into perldiag:
10522
10523             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10524
10525            (F) The part of the regexp subject to either the * or + quantifier
10526            could match an empty string. The {#} shows in the regular
10527            expression about where the problem was discovered.
10528
10529     */
10530
10531     if (!(flags&HASWIDTH) && op != '?')
10532       vFAIL("Regexp *+ operand could be empty");
10533 #endif
10534
10535 #ifdef RE_TRACK_PATTERN_OFFSETS
10536     parse_start = RExC_parse;
10537 #endif
          /* Consume the quantifier character held in <op>. */
10538     nextchar(pRExC_state);
10539
          /* '*' and '?' additionally report SPSTART; '+' does not. */
10540     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10541
          /* '*' and '+' on a simple atom get dedicated STAR / PLUS nodes.  Every
           * other combination sets min (and for '?' also max) and reuses the
           * do_curly code above. */
10542     if (op == '*' && (flags&SIMPLE)) {
10543         reginsert(pRExC_state, STAR, ret, depth+1);
10544         ret->flags = 0;
10545         RExC_naughty += 4;
10546         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10547     }
10548     else if (op == '*') {
10549         min = 0;
10550         goto do_curly;
10551     }
10552     else if (op == '+' && (flags&SIMPLE)) {
10553         reginsert(pRExC_state, PLUS, ret, depth+1);
10554         ret->flags = 0;
10555         RExC_naughty += 3;
10556         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10557     }
10558     else if (op == '+') {
10559         min = 1;
10560         goto do_curly;
10561     }
10562     else if (op == '?') {
10563         min = 0; max = 1;
10564         goto do_curly;
10565     }
10566   nest_check:
          /* Emit pass only: warn when the atom's flags carry neither HASWIDTH
           * nor POSTPONED and the upper bound exceeds REG_INFTY/3.  The text
           * shown is the pattern from origparse up to the current position.
           * NOTE(review): the ReREFCNT_inc after the warning presumably undoes
           * the SAVEFREESV when the warning was not fatal -- confirm. */
10567     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10568         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10569         ckWARN2reg(RExC_parse,
10570                    "%"UTF8f" matches null string many times",
10571                    UTF8fARG(UTF, (RExC_parse >= origparse
10572                                  ? RExC_parse - origparse
10573                                  : 0),
10574                    origparse));
10575         (void)ReREFCNT_inc(RExC_rx_sv);
10576     }
10577
          /* A '?' after the quantifier: insert MINMOD in front and link it to
           * the node that now immediately follows it. */
10578     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10579         nextchar(pRExC_state);
10580         reginsert(pRExC_state, MINMOD, ret, depth+1);
10581         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10582     }
10583     else
          /* A '+' after the quantifier: terminate the piece with SUCCEED, wrap
           * it in SUSPEND, and finish with a TAIL node. */
10584     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10585         regnode *ender;
10586         nextchar(pRExC_state);
10587         ender = reg_node(pRExC_state, SUCCEED);
10588         REGTAIL(pRExC_state, ret, ender);
10589         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10590         ret->flags = 0;
10591         ender = reg_node(pRExC_state, TAIL);
10592         REGTAIL(pRExC_state, ret, ender);
10593     }
10594
          /* Anything that still looks like a quantifier here is an error. */
10595     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10596         RExC_parse++;
10597         vFAIL("Nested quantifiers");
10598     }
10599
10600     return(ret);
10601 }
10602
10603 STATIC bool
10604 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10605                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10606                       const bool strict   /* Apply stricter parsing rules? */
10607     )
10608 {
10609
10610  /* This is expected to be called by a parser routine that has recognized '\N'
10611    and needs to handle the rest. RExC_parse is expected to point at the first
10612    char following the N at the time of the call.  On successful return,
10613    RExC_parse has been updated to point to just after the sequence identified
10614    by this routine, and <*flagp> has been updated.
10615
10616    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10617    character class.
10618
10619    \N may begin either a named sequence, or if outside a character class, mean
10620    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10621    attempted to decide which, and in the case of a named sequence, converted it
10622    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10623    where c1... are the characters in the sequence.  For single-quoted regexes,
10624    the tokenizer passes the \N sequence through unchanged; this code will not
10625    attempt to determine this nor expand those, instead raising a syntax error.
10626    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10627    or there is no '}', it signals that this \N occurrence means to match a
10628    non-newline.
10629
10630    Only the \N{U+...} form should occur in a character class, for the same
10631    reason that '.' inside a character class means to just match a period: it
10632    just doesn't make sense.
10633
10634    The function raises an error (via vFAIL), and doesn't return for various
10635    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10636    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10637    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10638    only possible if node_p is non-NULL.
10639
10640
10641    If <valuep> is non-null, it means the caller can accept an input sequence
10642    consisting of a just a single code point; <*valuep> is set to that value
10643    if the input is such.
10644
10645    If <node_p> is non-null it signifies that the caller can accept any other
10646    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10647    is set as follows:
10648     1) \N means not-a-NL: points to a newly created REG_ANY node;
10649     2) \N{}:              points to a new NOTHING node;
10650     3) otherwise:         points to a new EXACT node containing the resolved
10651                           string.
10652    Note that FALSE is returned for single code point sequences if <valuep> is
10653    null.
10654  */
10655
10656     char * endbrace;    /* '}' following the name */
10657     char* p;
10658     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10659                            stream */
10660     bool has_multiple_chars; /* true if the input stream contains a sequence of
10661                                 more than one character */
10662
10663     GET_RE_DEBUG_FLAGS_DECL;
10664
10665     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10666
10667     GET_RE_DEBUG_FLAGS;
10668
10669     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10670
10671     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10672      * modifier.  The other meaning does not, so use a temporary until we find
10673      * out which we are being called with */
10674     p = (RExC_flags & RXf_PMf_EXTENDED)
10675         ? regwhite( pRExC_state, RExC_parse )
10676         : RExC_parse;
10677
10678     /* Disambiguate between \N meaning a named character versus \N meaning
10679      * [^\n].  The former is assumed when it can't be the latter. */
10680     if (*p != '{' || regcurly(p, FALSE)) {
10681         RExC_parse = p;
10682         if (! node_p) {
10683             /* no bare \N allowed in a charclass */
10684             if (in_char_class) {
10685                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10686             }
10687             return FALSE;
10688         }
10689         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10690                            current char */
10691         nextchar(pRExC_state);
10692         *node_p = reg_node(pRExC_state, REG_ANY);
10693         *flagp |= HASWIDTH|SIMPLE;
10694         RExC_naughty++;
10695         Set_Node_Length(*node_p, 1); /* MJD */
10696         return TRUE;
10697     }
10698
10699     /* Here, we have decided it should be a named character or sequence */
10700
10701     /* The test above made sure that the next real character is a '{', but
10702      * under the /x modifier, it could be separated by space (or a comment and
10703      * \n) and this is not allowed (for consistency with \x{...} and the
10704      * tokenizer handling of \N{NAME}). */
10705     if (*RExC_parse != '{') {
10706         vFAIL("Missing braces on \\N{}");
10707     }
10708
10709     RExC_parse++;       /* Skip past the '{' */
10710
10711     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10712         || ! (endbrace == RExC_parse            /* nothing between the {} */
10713               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10714                                                  */
10715                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10716                                                      */
10717     {
10718         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10719         vFAIL("\\N{NAME} must be resolved by the lexer");
10720     }
10721
10722     if (endbrace == RExC_parse) {   /* empty: \N{} */
10723         bool ret = TRUE;
10724         if (node_p) {
10725             *node_p = reg_node(pRExC_state,NOTHING);
10726         }
10727         else if (in_char_class) {
10728             if (SIZE_ONLY && in_char_class) {
10729                 if (strict) {
10730                     RExC_parse++;   /* Position after the "}" */
10731                     vFAIL("Zero length \\N{}");
10732                 }
10733                 else {
10734                     ckWARNreg(RExC_parse,
10735                               "Ignoring zero length \\N{} in character class");
10736                 }
10737             }
10738             ret = FALSE;
10739         }
10740         else {
10741             return FALSE;
10742         }
10743         nextchar(pRExC_state);
10744         return ret;
10745     }
10746
10747     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10748     RExC_parse += 2;    /* Skip past the 'U+' */
10749
10750     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10751
10752     /* Code points are separated by dots.  If none, there is only one code
10753      * point, and is terminated by the brace */
10754     has_multiple_chars = (endchar < endbrace);
10755
10756     if (valuep && (! has_multiple_chars || in_char_class)) {
10757         /* We only pay attention to the first char of
10758         multichar strings being returned in char classes. I kinda wonder
10759         if this makes sense as it does change the behaviour
10760         from earlier versions, OTOH that behaviour was broken
10761         as well. XXX Solution is to recharacterize as
10762         [rest-of-class]|multi1|multi2... */
10763
10764         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10765         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10766             | PERL_SCAN_DISALLOW_PREFIX
10767             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10768
10769         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10770
10771         /* The tokenizer should have guaranteed validity, but it's possible to
10772          * bypass it by using single quoting, so check */
10773         if (length_of_hex == 0
10774             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10775         {
10776             RExC_parse += length_of_hex;        /* Includes all the valid */
10777             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10778                             ? UTF8SKIP(RExC_parse)
10779                             : 1;
10780             /* Guard against malformed utf8 */
10781             if (RExC_parse >= endchar) {
10782                 RExC_parse = endchar;
10783             }
10784             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10785         }
10786
10787         if (in_char_class && has_multiple_chars) {
10788             if (strict) {
10789                 RExC_parse = endbrace;
10790                 vFAIL("\\N{} in character class restricted to one character");
10791             }
10792             else {
10793                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10794             }
10795         }
10796
10797         RExC_parse = endbrace + 1;
10798     }
10799     else if (! node_p || ! has_multiple_chars) {
10800
10801         /* Here, the input is legal, but not according to the caller's
10802          * options.  We fail without advancing the parse, so that the
10803          * caller can try again */
10804         RExC_parse = p;
10805         return FALSE;
10806     }
10807     else {
10808
10809         /* What is done here is to convert this to a sub-pattern of the form
10810          * (?:\x{char1}\x{char2}...)
10811          * and then call reg recursively.  That way, it retains its atomicness,
10812          * while not having to worry about special handling that some code
10813          * points may have.  toke.c has converted the original Unicode values
10814          * to native, so that we can just pass on the hex values unchanged.  We
10815          * do have to set a flag to keep recoding from happening in the
10816          * recursion */
10817
10818         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10819         STRLEN len;
10820         char *orig_end = RExC_end;
10821         I32 flags;
10822
10823         while (RExC_parse < endbrace) {
10824
10825             /* Convert to notation the rest of the code understands */
10826             sv_catpv(substitute_parse, "\\x{");
10827             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10828             sv_catpv(substitute_parse, "}");
10829
10830             /* Point to the beginning of the next character in the sequence. */
10831             RExC_parse = endchar + 1;
10832             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10833         }
10834         sv_catpv(substitute_parse, ")");
10835
10836         RExC_parse = SvPV(substitute_parse, len);
10837
10838         /* Don't allow empty number */
10839         if (len < 8) {
10840             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10841         }
10842         RExC_end = RExC_parse + len;
10843
10844         /* The values are Unicode, and therefore not subject to recoding */
10845         RExC_override_recoding = 1;
10846
10847         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10848             if (flags & RESTART_UTF8) {
10849                 *flagp = RESTART_UTF8;
10850                 return FALSE;
10851             }
10852             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10853                   (UV) flags);
10854         }
10855         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10856
10857         RExC_parse = endbrace;
10858         RExC_end = orig_end;
10859         RExC_override_recoding = 0;
10860
10861         nextchar(pRExC_state);
10862     }
10863
10864     return TRUE;
10865 }
10866
10867
10868 /*
10869  * reg_recode
10870  *
10871  * It returns the code point in utf8 for the value in *encp.
10872  *    value: a code value in the source encoding
10873  *    encp:  a pointer to an Encode object
10874  *
10875  * If the result from Encode is not a single character,
10876  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10877  */
10878 STATIC UV
10879 S_reg_recode(pTHX_ const char value, SV **encp)
10880 {
10881     STRLEN numlen = 1;
10882     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10883     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10884     const STRLEN newlen = SvCUR(sv);
10885     UV uv = UNICODE_REPLACEMENT;
10886
10887     PERL_ARGS_ASSERT_REG_RECODE;
10888
10889     if (newlen)
10890         uv = SvUTF8(sv)
10891              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10892              : *(U8*)s;
10893
10894     if (!newlen || numlen != newlen) {
10895         uv = UNICODE_REPLACEMENT;
10896         *encp = NULL;
10897     }
10898     return uv;
10899 }
10900
10901 PERL_STATIC_INLINE U8
10902 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10903 {
10904     U8 op;
10905
10906     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10907
10908     if (! FOLD) {
10909         return EXACT;
10910     }
10911
10912     op = get_regex_charset(RExC_flags);
10913     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10914         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10915                  been, so there is no hole */
10916     }
10917
10918     return op + EXACTF;
10919 }
10920
10921 PERL_STATIC_INLINE void
10922 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10923                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10924                          bool downgradable)
10925 {
10926     /* This knows the details about sizing an EXACTish node, setting flags for
10927      * it (by setting <*flagp>, and potentially populating it with a single
10928      * character.
10929      *
10930      * If <len> (the length in bytes) is non-zero, this function assumes that
10931      * the node has already been populated, and just does the sizing.  In this
10932      * case <code_point> should be the final code point that has already been
10933      * placed into the node.  This value will be ignored except that under some
10934      * circumstances <*flagp> is set based on it.
10935      *
10936      * If <len> is zero, the function assumes that the node is to contain only
10937      * the single character given by <code_point> and calculates what <len>
10938      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10939      * additionally will populate the node's STRING with <code_point> or its
10940      * fold if folding.
10941      *
10942      * In both cases <*flagp> is appropriately set
10943      *
10944      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10945      * 255, must be folded (the former only when the rules indicate it can
10946      * match 'ss')
10947      *
10948      * When it does the populating, it looks at the flag 'downgradable'.  If
10949      * true with a node that folds, it checks if the single code point
10950      * participates in a fold, and if not downgrades the node to an EXACT.
10951      * This helps the optimizer */
10952
10953     bool len_passed_in = cBOOL(len != 0);
10954     U8 character[UTF8_MAXBYTES_CASE+1];
10955
10956     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10957
10958     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10959      * sizing difference, and is extra work that is thrown away */
10960     if (downgradable && ! PASS2) {
10961         downgradable = FALSE;
10962     }
10963
10964     if (! len_passed_in) {
10965         if (UTF) {
10966             if (UNI_IS_INVARIANT(code_point)) {
10967                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10968                     *character = (U8) code_point;
10969                 }
10970                 else { /* Here is /i and not /l (toFOLD() is defined on just
10971                           ASCII, which isn't the same thing as INVARIANT on
10972                           EBCDIC, but it works there, as the extra invariants
10973                           fold to themselves) */
10974                     *character = toFOLD((U8) code_point);
10975                     if (downgradable
10976                         && *character == code_point
10977                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
10978                     {
10979                         OP(node) = EXACT;
10980                     }
10981                 }
10982                 len = 1;
10983             }
10984             else if (FOLD && (! LOC
10985                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10986             {   /* Folding, and ok to do so now */
10987                 UV folded = _to_uni_fold_flags(
10988                                    code_point,
10989                                    character,
10990                                    &len,
10991                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
10992                                                       ? FOLD_FLAGS_NOMIX_ASCII
10993                                                       : 0));
10994                 if (downgradable
10995                     && folded == code_point
10996                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
10997                 {
10998                     OP(node) = EXACT;
10999                 }
11000             }
11001             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11002
11003                 /* Not folding this cp, and can output it directly */
11004                 *character = UTF8_TWO_BYTE_HI(code_point);
11005                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11006                 len = 2;
11007             }
11008             else {
11009                 uvchr_to_utf8( character, code_point);
11010                 len = UTF8SKIP(character);
11011             }
11012         } /* Else pattern isn't UTF8.  */
11013         else if (! FOLD) {
11014             *character = (U8) code_point;
11015             len = 1;
11016         } /* Else is folded non-UTF8 */
11017         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11018
11019             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11020              * comments at join_exact()); */
11021             *character = (U8) code_point;
11022             len = 1;
11023
11024             /* Can turn into an EXACT node if we know the fold at compile time,
11025              * and it folds to itself and doesn't particpate in other folds */
11026             if (downgradable
11027                 && ! LOC
11028                 && PL_fold_latin1[code_point] == code_point
11029                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11030                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11031             {
11032                 OP(node) = EXACT;
11033             }
11034         } /* else is Sharp s.  May need to fold it */
11035         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11036             *character = 's';
11037             *(character + 1) = 's';
11038             len = 2;
11039         }
11040         else {
11041             *character = LATIN_SMALL_LETTER_SHARP_S;
11042             len = 1;
11043         }
11044     }
11045
11046     if (SIZE_ONLY) {
11047         RExC_size += STR_SZ(len);
11048     }
11049     else {
11050         RExC_emit += STR_SZ(len);
11051         STR_LEN(node) = len;
11052         if (! len_passed_in) {
11053             Copy((char *) character, STRING(node), len, char);
11054         }
11055     }
11056
11057     *flagp |= HASWIDTH;
11058
11059     /* A single character node is SIMPLE, except for the special-cased SHARP S
11060      * under /di. */
11061     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11062         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11063             || ! FOLD || ! DEPENDS_SEMANTICS))
11064     {
11065         *flagp |= SIMPLE;
11066     }
11067
11068     /* The OP may not be well defined in PASS1 */
11069     if (PASS2 && OP(node) == EXACTFL) {
11070         RExC_contains_locale = 1;
11071     }
11072 }
11073
11074
11075 /* return atoi(p), unless it's too big to sensibly be a backref,
11076  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11077
11078 static I32
11079 S_backref_value(char *p)
11080 {
11081     char *q = p;
11082
11083     for (;isDIGIT(*q); q++); /* calculate length of num */
11084     if (q - p == 0 || q - p > 9)
11085         return I32_MAX;
11086     return atoi(p);
11087 }
11088
11089
11090 /*
11091  - regatom - the lowest level
11092
11093    Try to identify anything special at the start of the pattern. If there
11094    is, then handle it as required. This may involve generating a single regop,
11095    such as for an assertion; or it may involve recursing, such as to
11096    handle a () structure.
11097
11098    If the string doesn't start with something special then we gobble up
11099    as much literal text as we can.
11100
11101    Once we have been able to handle whatever type of thing started the
11102    sequence, we return.
11103
11104    Note: we have to be careful with escapes, as they can be both literal
11105    and special, and in the case of \10 and friends, context determines which.
11106
11107    A summary of the code structure is:
11108
11109    switch (first_byte) {
11110         cases for each special:
11111             handle this special;
11112             break;
11113         case '\\':
11114             switch (2nd byte) {
11115                 cases for each unambiguous special:
11116                     handle this special;
11117                     break;
11118                 cases for each ambigous special/literal:
11119                     disambiguate;
11120                     if (special)  handle here
11121                     else goto defchar;
11122                 default: // unambiguously literal:
11123                     goto defchar;
11124             }
11125         default:  // is a literal char
11126             // FALL THROUGH
11127         defchar:
11128             create EXACTish node for literal;
11129             while (more input and node isn't full) {
11130                 switch (input_byte) {
11131                    cases for each special;
11132                        make sure parse pointer is set so that the next call to
11133                            regatom will see this special first
11134                        goto loopdone; // EXACTish node terminated by prev. char
11135                    default:
11136                        append char to EXACTISH node;
11137                 }
11138                 get next input byte;
11139             }
11140         loopdone:
11141    }
11142    return the generated node;
11143
11144    Specifically there are two separate switches for handling
11145    escape sequences, with the one for handling literal escapes requiring
11146    a dummy entry for all of the special escapes that are actually handled
11147    by the other.
11148
11149    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11150    TRYAGAIN.
11151    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11152    restarted.
11153    Otherwise does not return NULL.
11154 */
11155
11156 STATIC regnode *
11157 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11158 {
11159     dVAR;
11160     regnode *ret = NULL;
11161     I32 flags = 0;
11162     char *parse_start = RExC_parse;
11163     U8 op;
11164     int invert = 0;
11165
11166     GET_RE_DEBUG_FLAGS_DECL;
11167
11168     *flagp = WORST;             /* Tentatively. */
11169
11170     DEBUG_PARSE("atom");
11171
11172     PERL_ARGS_ASSERT_REGATOM;
11173
11174 tryagain:
11175     switch ((U8)*RExC_parse) {
11176     case '^':
11177         RExC_seen_zerolen++;
11178         nextchar(pRExC_state);
11179         if (RExC_flags & RXf_PMf_MULTILINE)
11180             ret = reg_node(pRExC_state, MBOL);
11181         else if (RExC_flags & RXf_PMf_SINGLELINE)
11182             ret = reg_node(pRExC_state, SBOL);
11183         else
11184             ret = reg_node(pRExC_state, BOL);
11185         Set_Node_Length(ret, 1); /* MJD */
11186         break;
11187     case '$':
11188         nextchar(pRExC_state);
11189         if (*RExC_parse)
11190             RExC_seen_zerolen++;
11191         if (RExC_flags & RXf_PMf_MULTILINE)
11192             ret = reg_node(pRExC_state, MEOL);
11193         else if (RExC_flags & RXf_PMf_SINGLELINE)
11194             ret = reg_node(pRExC_state, SEOL);
11195         else
11196             ret = reg_node(pRExC_state, EOL);
11197         Set_Node_Length(ret, 1); /* MJD */
11198         break;
11199     case '.':
11200         nextchar(pRExC_state);
11201         if (RExC_flags & RXf_PMf_SINGLELINE)
11202             ret = reg_node(pRExC_state, SANY);
11203         else
11204             ret = reg_node(pRExC_state, REG_ANY);
11205         *flagp |= HASWIDTH|SIMPLE;
11206         RExC_naughty++;
11207         Set_Node_Length(ret, 1); /* MJD */
11208         break;
11209     case '[':
11210     {
11211         char * const oregcomp_parse = ++RExC_parse;
11212         ret = regclass(pRExC_state, flagp,depth+1,
11213                        FALSE, /* means parse the whole char class */
11214                        TRUE, /* allow multi-char folds */
11215                        FALSE, /* don't silence non-portable warnings. */
11216                        NULL);
11217         if (*RExC_parse != ']') {
11218             RExC_parse = oregcomp_parse;
11219             vFAIL("Unmatched [");
11220         }
11221         if (ret == NULL) {
11222             if (*flagp & RESTART_UTF8)
11223                 return NULL;
11224             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11225                   (UV) *flagp);
11226         }
11227         nextchar(pRExC_state);
11228         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11229         break;
11230     }
11231     case '(':
11232         nextchar(pRExC_state);
11233         ret = reg(pRExC_state, 2, &flags,depth+1);
11234         if (ret == NULL) {
11235                 if (flags & TRYAGAIN) {
11236                     if (RExC_parse == RExC_end) {
11237                          /* Make parent create an empty node if needed. */
11238                         *flagp |= TRYAGAIN;
11239                         return(NULL);
11240                     }
11241                     goto tryagain;
11242                 }
11243                 if (flags & RESTART_UTF8) {
11244                     *flagp = RESTART_UTF8;
11245                     return NULL;
11246                 }
11247                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11248                                                                  (UV) flags);
11249         }
11250         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11251         break;
11252     case '|':
11253     case ')':
11254         if (flags & TRYAGAIN) {
11255             *flagp |= TRYAGAIN;
11256             return NULL;
11257         }
11258         vFAIL("Internal urp");
11259                                 /* Supposed to be caught earlier. */
11260         break;
11261     case '{':
11262         if (!regcurly(RExC_parse, FALSE)) {
11263             RExC_parse++;
11264             goto defchar;
11265         }
11266         /* FALL THROUGH */
11267     case '?':
11268     case '+':
11269     case '*':
11270         RExC_parse++;
11271         vFAIL("Quantifier follows nothing");
11272         break;
11273     case '\\':
11274         /* Special Escapes
11275
11276            This switch handles escape sequences that resolve to some kind
11277            of special regop and not to literal text. Escape sequnces that
11278            resolve to literal text are handled below in the switch marked
11279            "Literal Escapes".
11280
11281            Every entry in this switch *must* have a corresponding entry
11282            in the literal escape switch. However, the opposite is not
11283            required, as the default for this switch is to jump to the
11284            literal text handling code.
11285         */
11286         switch ((U8)*++RExC_parse) {
11287             U8 arg;
11288         /* Special Escapes */
11289         case 'A':
11290             RExC_seen_zerolen++;
11291             ret = reg_node(pRExC_state, SBOL);
11292             *flagp |= SIMPLE;
11293             goto finish_meta_pat;
11294         case 'G':
11295             ret = reg_node(pRExC_state, GPOS);
11296             RExC_seen |= REG_GPOS_SEEN;
11297             *flagp |= SIMPLE;
11298             goto finish_meta_pat;
11299         case 'K':
11300             RExC_seen_zerolen++;
11301             ret = reg_node(pRExC_state, KEEPS);
11302             *flagp |= SIMPLE;
11303             /* XXX:dmq : disabling in-place substitution seems to
11304              * be necessary here to avoid cases of memory corruption, as
11305              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11306              */
11307             RExC_seen |= REG_LOOKBEHIND_SEEN;
11308             goto finish_meta_pat;
11309         case 'Z':
11310             ret = reg_node(pRExC_state, SEOL);
11311             *flagp |= SIMPLE;
11312             RExC_seen_zerolen++;                /* Do not optimize RE away */
11313             goto finish_meta_pat;
11314         case 'z':
11315             ret = reg_node(pRExC_state, EOS);
11316             *flagp |= SIMPLE;
11317             RExC_seen_zerolen++;                /* Do not optimize RE away */
11318             goto finish_meta_pat;
11319         case 'C':
11320             ret = reg_node(pRExC_state, CANY);
11321             RExC_seen |= REG_CANY_SEEN;
11322             *flagp |= HASWIDTH|SIMPLE;
11323             goto finish_meta_pat;
11324         case 'X':
11325             ret = reg_node(pRExC_state, CLUMP);
11326             *flagp |= HASWIDTH;
11327             goto finish_meta_pat;
11328
11329         case 'W':
11330             invert = 1;
11331             /* FALLTHROUGH */
11332         case 'w':
11333             arg = ANYOF_WORDCHAR;
11334             goto join_posix;
11335
11336         case 'b':
11337             RExC_seen_zerolen++;
11338             RExC_seen |= REG_LOOKBEHIND_SEEN;
11339             op = BOUND + get_regex_charset(RExC_flags);
11340             if (op > BOUNDA) {  /* /aa is same as /a */
11341                 op = BOUNDA;
11342             }
11343             else if (op == BOUNDL) {
11344                 RExC_contains_locale = 1;
11345             }
11346             ret = reg_node(pRExC_state, op);
11347             FLAGS(ret) = get_regex_charset(RExC_flags);
11348             *flagp |= SIMPLE;
11349             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11350                 /* diag_listed_as: Use "%s" instead of "%s" */
11351                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11352             }
11353             goto finish_meta_pat;
11354         case 'B':
11355             RExC_seen_zerolen++;
11356             RExC_seen |= REG_LOOKBEHIND_SEEN;
11357             op = NBOUND + get_regex_charset(RExC_flags);
11358             if (op > NBOUNDA) { /* /aa is same as /a */
11359                 op = NBOUNDA;
11360             }
11361             else if (op == NBOUNDL) {
11362                 RExC_contains_locale = 1;
11363             }
11364             ret = reg_node(pRExC_state, op);
11365             FLAGS(ret) = get_regex_charset(RExC_flags);
11366             *flagp |= SIMPLE;
11367             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11368                 /* diag_listed_as: Use "%s" instead of "%s" */
11369                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11370             }
11371             goto finish_meta_pat;
11372
11373         case 'D':
11374             invert = 1;
11375             /* FALLTHROUGH */
11376         case 'd':
11377             arg = ANYOF_DIGIT;
11378             goto join_posix;
11379
11380         case 'R':
11381             ret = reg_node(pRExC_state, LNBREAK);
11382             *flagp |= HASWIDTH|SIMPLE;
11383             goto finish_meta_pat;
11384
11385         case 'H':
11386             invert = 1;
11387             /* FALLTHROUGH */
11388         case 'h':
11389             arg = ANYOF_BLANK;
11390             op = POSIXU;
11391             goto join_posix_op_known;
11392
11393         case 'V':
11394             invert = 1;
11395             /* FALLTHROUGH */
11396         case 'v':
11397             arg = ANYOF_VERTWS;
11398             op = POSIXU;
11399             goto join_posix_op_known;
11400
11401         case 'S':
11402             invert = 1;
11403             /* FALLTHROUGH */
11404         case 's':
11405             arg = ANYOF_SPACE;
11406
11407         join_posix:
11408
11409             op = POSIXD + get_regex_charset(RExC_flags);
11410             if (op > POSIXA) {  /* /aa is same as /a */
11411                 op = POSIXA;
11412             }
11413             else if (op == POSIXL) {
11414                 RExC_contains_locale = 1;
11415             }
11416
11417         join_posix_op_known:
11418
11419             if (invert) {
11420                 op += NPOSIXD - POSIXD;
11421             }
11422
11423             ret = reg_node(pRExC_state, op);
11424             if (! SIZE_ONLY) {
11425                 FLAGS(ret) = namedclass_to_classnum(arg);
11426             }
11427
11428             *flagp |= HASWIDTH|SIMPLE;
11429             /* FALL THROUGH */
11430
11431          finish_meta_pat:
11432             nextchar(pRExC_state);
11433             Set_Node_Length(ret, 2); /* MJD */
11434             break;
11435         case 'p':
11436         case 'P':
11437             {
11438 #ifdef DEBUGGING
11439                 char* parse_start = RExC_parse - 2;
11440 #endif
11441
11442                 RExC_parse--;
11443
11444                 ret = regclass(pRExC_state, flagp,depth+1,
11445                                TRUE, /* means just parse this element */
11446                                FALSE, /* don't allow multi-char folds */
11447                                FALSE, /* don't silence non-portable warnings.
11448                                          It would be a bug if these returned
11449                                          non-portables */
11450                                NULL);
11451                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11452                    are allowed.  */
11453                 if (!ret)
11454                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11455                           (UV) *flagp);
11456
11457                 RExC_parse--;
11458
11459                 Set_Node_Offset(ret, parse_start + 2);
11460                 Set_Node_Cur_Length(ret, parse_start);
11461                 nextchar(pRExC_state);
11462             }
11463             break;
11464         case 'N':
11465             /* Handle \N and \N{NAME} with multiple code points here and not
11466              * below because it can be multicharacter. join_exact() will join
11467              * them up later on.  Also this makes sure that things like
11468              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11469              * The options to the grok function call causes it to fail if the
11470              * sequence is just a single code point.  We then go treat it as
11471              * just another character in the current EXACT node, and hence it
11472              * gets uniform treatment with all the other characters.  The
11473              * special treatment for quantifiers is not needed for such single
11474              * character sequences */
11475             ++RExC_parse;
11476             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11477                                 FALSE /* not strict */ )) {
11478                 if (*flagp & RESTART_UTF8)
11479                     return NULL;
11480                 RExC_parse--;
11481                 goto defchar;
11482             }
11483             break;
11484         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11485         parse_named_seq:
11486         {
11487             char ch= RExC_parse[1];
11488             if (ch != '<' && ch != '\'' && ch != '{') {
11489                 RExC_parse++;
11490                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11491                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11492             } else {
11493                 /* this pretty much dupes the code for (?P=...) in reg(), if
11494                    you change this make sure you change that */
11495                 char* name_start = (RExC_parse += 2);
11496                 U32 num = 0;
11497                 SV *sv_dat = reg_scan_name(pRExC_state,
11498                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11499                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11500                 if (RExC_parse == name_start || *RExC_parse != ch)
11501                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11502                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11503
11504                 if (!SIZE_ONLY) {
11505                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11506                     RExC_rxi->data->data[num]=(void*)sv_dat;
11507                     SvREFCNT_inc_simple_void(sv_dat);
11508                 }
11509
11510                 RExC_sawback = 1;
11511                 ret = reganode(pRExC_state,
11512                                ((! FOLD)
11513                                  ? NREF
11514                                  : (ASCII_FOLD_RESTRICTED)
11515                                    ? NREFFA
11516                                    : (AT_LEAST_UNI_SEMANTICS)
11517                                      ? NREFFU
11518                                      : (LOC)
11519                                        ? NREFFL
11520                                        : NREFF),
11521                                 num);
11522                 *flagp |= HASWIDTH;
11523
11524                 /* override incorrect value set in reganode MJD */
11525                 Set_Node_Offset(ret, parse_start+1);
11526                 Set_Node_Cur_Length(ret, parse_start);
11527                 nextchar(pRExC_state);
11528
11529             }
11530             break;
11531         }
11532         case 'g':
11533         case '1': case '2': case '3': case '4':
11534         case '5': case '6': case '7': case '8': case '9':
11535             {
11536                 I32 num;
11537                 bool hasbrace = 0;
11538
11539                 if (*RExC_parse == 'g') {
11540                     bool isrel = 0;
11541
11542                     RExC_parse++;
11543                     if (*RExC_parse == '{') {
11544                         RExC_parse++;
11545                         hasbrace = 1;
11546                     }
11547                     if (*RExC_parse == '-') {
11548                         RExC_parse++;
11549                         isrel = 1;
11550                     }
11551                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11552                         if (isrel) RExC_parse--;
11553                         RExC_parse -= 2;
11554                         goto parse_named_seq;
11555                     }
11556
11557                     num = S_backref_value(RExC_parse);
11558                     if (num == 0)
11559                         vFAIL("Reference to invalid group 0");
11560                     else if (num == I32_MAX) {
11561                          if (isDIGIT(*RExC_parse))
11562                             vFAIL("Reference to nonexistent group");
11563                         else
11564                             vFAIL("Unterminated \\g... pattern");
11565                     }
11566
11567                     if (isrel) {
11568                         num = RExC_npar - num;
11569                         if (num < 1)
11570                             vFAIL("Reference to nonexistent or unclosed group");
11571                     }
11572                 }
11573                 else {
11574                     num = S_backref_value(RExC_parse);
11575                     /* bare \NNN might be backref or octal - if it is larger than or equal
11576                      * to RExC_npar then it is assumed to be an octal escape.
11577                      * Note RExC_npar is +1 from the actual number of parens */
11578                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11579                             && *RExC_parse != '8' && *RExC_parse != '9'))
11580                     {
11581                         /* Probably a character specified in octal, e.g. \35 */
11582                         goto defchar;
11583                     }
11584                 }
11585
11586                 /* at this point RExC_parse definitely points to a backref
11587                  * number */
11588                 {
11589 #ifdef RE_TRACK_PATTERN_OFFSETS
11590                     char * const parse_start = RExC_parse - 1; /* MJD */
11591 #endif
11592                     while (isDIGIT(*RExC_parse))
11593                         RExC_parse++;
11594                     if (hasbrace) {
11595                         if (*RExC_parse != '}')
11596                             vFAIL("Unterminated \\g{...} pattern");
11597                         RExC_parse++;
11598                     }
11599                     if (!SIZE_ONLY) {
11600                         if (num > (I32)RExC_rx->nparens)
11601                             vFAIL("Reference to nonexistent group");
11602                     }
11603                     RExC_sawback = 1;
11604                     ret = reganode(pRExC_state,
11605                                    ((! FOLD)
11606                                      ? REF
11607                                      : (ASCII_FOLD_RESTRICTED)
11608                                        ? REFFA
11609                                        : (AT_LEAST_UNI_SEMANTICS)
11610                                          ? REFFU
11611                                          : (LOC)
11612                                            ? REFFL
11613                                            : REFF),
11614                                     num);
11615                     *flagp |= HASWIDTH;
11616
11617                     /* override incorrect value set in reganode MJD */
11618                     Set_Node_Offset(ret, parse_start+1);
11619                     Set_Node_Cur_Length(ret, parse_start);
11620                     RExC_parse--;
11621                     nextchar(pRExC_state);
11622                 }
11623             }
11624             break;
11625         case '\0':
11626             if (RExC_parse >= RExC_end)
11627                 FAIL("Trailing \\");
11628             /* FALL THROUGH */
11629         default:
11630             /* Do not generate "unrecognized" warnings here, we fall
11631                back into the quick-grab loop below */
11632             parse_start--;
11633             goto defchar;
11634         }
11635         break;
11636
11637     case '#':
11638         if (RExC_flags & RXf_PMf_EXTENDED) {
11639             if ( reg_skipcomment( pRExC_state ) )
11640                 goto tryagain;
11641         }
11642         /* FALL THROUGH */
11643
11644     default:
11645
11646             parse_start = RExC_parse - 1;
11647
11648             RExC_parse++;
11649
11650         defchar: {
11651             STRLEN len = 0;
11652             UV ender = 0;
11653             char *p;
11654             char *s;
11655 #define MAX_NODE_STRING_SIZE 127
11656             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11657             char *s0;
11658             U8 upper_parse = MAX_NODE_STRING_SIZE;
11659             U8 node_type = compute_EXACTish(pRExC_state);
11660             bool next_is_quantifier;
11661             char * oldp = NULL;
11662
11663             /* We can convert EXACTF nodes to EXACTFU if they contain only
11664              * characters that match identically regardless of the target
11665              * string's UTF8ness.  The reason to do this is that EXACTF is not
11666              * trie-able, EXACTFU is.
11667              *
11668              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11669              * contain only above-Latin1 characters (hence must be in UTF8),
11670              * which don't participate in folds with Latin1-range characters,
11671              * as the latter's folds aren't known until runtime.  (We don't
11672              * need to figure this out until pass 2) */
11673             bool maybe_exactfu = PASS2
11674                                && (node_type == EXACTF || node_type == EXACTFL);
11675
11676             /* If a folding node contains only code points that don't
11677              * participate in folds, it can be changed into an EXACT node,
11678              * which allows the optimizer more things to look for */
11679             bool maybe_exact;
11680
11681             ret = reg_node(pRExC_state, node_type);
11682
11683             /* In pass1, folded, we use a temporary buffer instead of the
11684              * actual node, as the node doesn't exist yet */
11685             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11686
11687             s0 = s;
11688
11689         reparse:
11690
11691             /* We do the EXACTFish to EXACT node only if folding.  (And we
11692              * don't need to figure this out until pass 2) */
11693             maybe_exact = FOLD && PASS2;
11694
11695             /* XXX The node can hold up to 255 bytes, yet this only goes to
11696              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11697              * 255 allows us to not have to worry about overflow due to
11698              * converting to utf8 and fold expansion, but that value is
11699              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11700              * split up by this limit into a single one using the real max of
11701              * 255.  Even at 127, this breaks under rare circumstances.  If
11702              * folding, we do not want to split a node at a character that is a
11703              * non-final in a multi-char fold, as an input string could just
11704              * happen to want to match across the node boundary.  The join
11705              * would solve that problem if the join actually happens.  But a
11706              * series of more than two nodes in a row each of 127 would cause
11707              * the first join to succeed to get to 254, but then there wouldn't
11708              * be room for the next one, which could be one of those split
11709              * multi-char folds.  I don't know of any fool-proof solution.  One
11710              * could back off to end with only a code point that isn't such a
11711              * non-final, but it is possible for there not to be any in the
11712              * entire node. */
11713             for (p = RExC_parse - 1;
11714                  len < upper_parse && p < RExC_end;
11715                  len++)
11716             {
11717                 oldp = p;
11718
11719                 if (RExC_flags & RXf_PMf_EXTENDED)
11720                     p = regwhite( pRExC_state, p );
11721                 switch ((U8)*p) {
11722                 case '^':
11723                 case '$':
11724                 case '.':
11725                 case '[':
11726                 case '(':
11727                 case ')':
11728                 case '|':
11729                     goto loopdone;
11730                 case '\\':
11731                     /* Literal Escapes Switch
11732
11733                        This switch is meant to handle escape sequences that
11734                        resolve to a literal character.
11735
11736                        Every escape sequence that represents something
11737                        else, like an assertion or a char class, is handled
11738                        in the switch marked 'Special Escapes' above in this
11739                        routine, but also has an entry here as anything that
11740                        isn't explicitly mentioned here will be treated as
11741                        an unescaped equivalent literal.
11742                     */
11743
11744                     switch ((U8)*++p) {
11745                     /* These are all the special escapes. */
11746                     case 'A':             /* Start assertion */
11747                     case 'b': case 'B':   /* Word-boundary assertion*/
11748                     case 'C':             /* Single char !DANGEROUS! */
11749                     case 'd': case 'D':   /* digit class */
11750                     case 'g': case 'G':   /* generic-backref, pos assertion */
11751                     case 'h': case 'H':   /* HORIZWS */
11752                     case 'k': case 'K':   /* named backref, keep marker */
11753                     case 'p': case 'P':   /* Unicode property */
11754                               case 'R':   /* LNBREAK */
11755                     case 's': case 'S':   /* space class */
11756                     case 'v': case 'V':   /* VERTWS */
11757                     case 'w': case 'W':   /* word class */
11758                     case 'X':             /* eXtended Unicode "combining
11759                                              character sequence" */
11760                     case 'z': case 'Z':   /* End of line/string assertion */
11761                         --p;
11762                         goto loopdone;
11763
11764                     /* Anything after here is an escape that resolves to a
11765                        literal. (Except digits, which may or may not)
11766                      */
11767                     case 'n':
11768                         ender = '\n';
11769                         p++;
11770                         break;
11771                     case 'N': /* Handle a single-code point named character. */
11772                         /* The options cause it to fail if a multiple code
11773                          * point sequence.  Handle those in the switch() above
11774                          * */
11775                         RExC_parse = p + 1;
11776                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11777                                             flagp, depth, FALSE,
11778                                             FALSE /* not strict */ ))
11779                         {
11780                             if (*flagp & RESTART_UTF8)
11781                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11782                             RExC_parse = p = oldp;
11783                             goto loopdone;
11784                         }
11785                         p = RExC_parse;
11786                         if (ender > 0xff) {
11787                             REQUIRE_UTF8;
11788                         }
11789                         break;
11790                     case 'r':
11791                         ender = '\r';
11792                         p++;
11793                         break;
11794                     case 't':
11795                         ender = '\t';
11796                         p++;
11797                         break;
11798                     case 'f':
11799                         ender = '\f';
11800                         p++;
11801                         break;
11802                     case 'e':
11803                           ender = ASCII_TO_NATIVE('\033');
11804                         p++;
11805                         break;
11806                     case 'a':
11807                           ender = '\a';
11808                         p++;
11809                         break;
11810                     case 'o':
11811                         {
11812                             UV result;
11813                             const char* error_msg;
11814
11815                             bool valid = grok_bslash_o(&p,
11816                                                        &result,
11817                                                        &error_msg,
11818                                                        TRUE, /* out warnings */
11819                                                        FALSE, /* not strict */
11820                                                        TRUE, /* Output warnings
11821                                                                 for non-
11822                                                                 portables */
11823                                                        UTF);
11824                             if (! valid) {
11825                                 RExC_parse = p; /* going to die anyway; point
11826                                                    to exact spot of failure */
11827                                 vFAIL(error_msg);
11828                             }
11829                             ender = result;
11830                             if (PL_encoding && ender < 0x100) {
11831                                 goto recode_encoding;
11832                             }
11833                             if (ender > 0xff) {
11834                                 REQUIRE_UTF8;
11835                             }
11836                             break;
11837                         }
11838                     case 'x':
11839                         {
11840                             UV result = UV_MAX; /* initialize to erroneous
11841                                                    value */
11842                             const char* error_msg;
11843
11844                             bool valid = grok_bslash_x(&p,
11845                                                        &result,
11846                                                        &error_msg,
11847                                                        TRUE, /* out warnings */
11848                                                        FALSE, /* not strict */
11849                                                        TRUE, /* Output warnings
11850                                                                 for non-
11851                                                                 portables */
11852                                                        UTF);
11853                             if (! valid) {
11854                                 RExC_parse = p; /* going to die anyway; point
11855                                                    to exact spot of failure */
11856                                 vFAIL(error_msg);
11857                             }
11858                             ender = result;
11859
11860                             if (PL_encoding && ender < 0x100) {
11861                                 goto recode_encoding;
11862                             }
11863                             if (ender > 0xff) {
11864                                 REQUIRE_UTF8;
11865                             }
11866                             break;
11867                         }
11868                     case 'c':
11869                         p++;
11870                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11871                         break;
11872                     case '8': case '9': /* must be a backreference */
11873                         --p;
11874                         goto loopdone;
11875                     case '1': case '2': case '3':case '4':
11876                     case '5': case '6': case '7':
11877                         /* When we parse backslash escapes there is ambiguity
11878                          * between backreferences and octal escapes. Any escape
11879                          * from \1 - \9 is a backreference, any multi-digit
11880                          * escape which does not start with 0 and which when
11881                          * evaluated as decimal could refer to an already
11882                          * parsed capture buffer is a backreference. Anything else
11883                          * is octal.
11884                          *
11885                          * Note this implies that \118 could be interpreted as
11886                          * 118 OR as "\11" . "8" depending on whether there
11887                          * were 118 capture buffers defined already in the
11888                          * pattern.  */
11889
11890                         /* NOTE, RExC_npar is 1 more than the actual number of
11891                          * parens we have seen so far, hence the < RExC_npar below. */
11892
11893                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11894                         {  /* Not to be treated as an octal constant, go
11895                                    find backref */
11896                             --p;
11897                             goto loopdone;
11898                         }
11899                     case '0':
11900                         {
11901                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11902                             STRLEN numlen = 3;
11903                             ender = grok_oct(p, &numlen, &flags, NULL);
11904                             if (ender > 0xff) {
11905                                 REQUIRE_UTF8;
11906                             }
11907                             p += numlen;
11908                             if (SIZE_ONLY   /* like \08, \178 */
11909                                 && numlen < 3
11910                                 && p < RExC_end
11911                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11912                             {
11913                                 reg_warn_non_literal_string(
11914                                          p + 1,
11915                                          form_short_octal_warning(p, numlen));
11916                             }
11917                         }
11918                         if (PL_encoding && ender < 0x100)
11919                             goto recode_encoding;
11920                         break;
11921                     recode_encoding:
11922                         if (! RExC_override_recoding) {
11923                             SV* enc = PL_encoding;
11924                             ender = reg_recode((const char)(U8)ender, &enc);
11925                             if (!enc && SIZE_ONLY)
11926                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11927                             REQUIRE_UTF8;
11928                         }
11929                         break;
11930                     case '\0':
11931                         if (p >= RExC_end)
11932                             FAIL("Trailing \\");
11933                         /* FALL THROUGH */
11934                     default:
11935                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11936                             /* Include any { following the alpha to emphasize
11937                              * that it could be part of an escape at some point
11938                              * in the future */
11939                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11940                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11941                         }
11942                         goto normal_default;
11943                     } /* End of switch on '\' */
11944                     break;
11945                 default:    /* A literal character */
11946
11947                     if (! SIZE_ONLY
11948                         && RExC_flags & RXf_PMf_EXTENDED
11949                         && ckWARN_d(WARN_DEPRECATED)
11950                         && is_PATWS_non_low_safe(p, RExC_end, UTF))
11951                     {
11952                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11953                                 "Escape literal pattern white space under /x");
11954                     }
11955
11956                   normal_default:
11957                     if (UTF8_IS_START(*p) && UTF) {
11958                         STRLEN numlen;
11959                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11960                                                &numlen, UTF8_ALLOW_DEFAULT);
11961                         p += numlen;
11962                     }
11963                     else
11964                         ender = (U8) *p++;
11965                     break;
11966                 } /* End of switch on the literal */
11967
11968                 /* Here, have looked at the literal character and <ender>
11969                  * contains its ordinal, <p> points to the character after it
11970                  */
11971
11972                 if ( RExC_flags & RXf_PMf_EXTENDED)
11973                     p = regwhite( pRExC_state, p );
11974
11975                 /* If the next thing is a quantifier, it applies to this
11976                  * character only, which means that this character has to be in
11977                  * its own node and can't just be appended to the string in an
11978                  * existing node, so if there are already other characters in
11979                  * the node, close the node with just them, and set up to do
11980                  * this character again next time through, when it will be the
11981                  * only thing in its new node */
11982                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11983                 {
11984                     p = oldp;
11985                     goto loopdone;
11986                 }
11987
11988                 if (! FOLD   /* The simple case, just append the literal */
11989                     || (LOC  /* Also don't fold for tricky chars under /l */
11990                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
11991                 {
11992                     if (UTF) {
11993                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11994                         if (unilen > 0) {
11995                            s   += unilen;
11996                            len += unilen;
11997                         }
11998
11999                         /* The loop increments <len> each time, as all but this
12000                          * path (and one other) through it add a single byte to
12001                          * the EXACTish node.  But this one has changed len to
12002                          * be the correct final value, so subtract one to
12003                          * cancel out the increment that follows */
12004                         len--;
12005                     }
12006                     else {
12007                         REGC((char)ender, s++);
12008                     }
12009
12010                     /* Can get here if folding only if is one of the /l
12011                      * characters whose fold depends on the locale.  The
12012                      * occurrence of any of these indicate that we can't
12013                      * simplify things */
12014                     if (FOLD) {
12015                         maybe_exact = FALSE;
12016                         maybe_exactfu = FALSE;
12017                     }
12018                 }
12019                 else             /* FOLD */
12020                      if (! ( UTF
12021                         /* See comments for join_exact() as to why we fold this
12022                          * non-UTF at compile time */
12023                         || (node_type == EXACTFU
12024                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12025                 {
12026                     /* Here, are folding and are not UTF-8 encoded; therefore
12027                      * the character must be in the range 0-255, and is not /l
12028                      * (Not /l because we already handled these under /l in
12029                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12030                     if (IS_IN_SOME_FOLD_L1(ender)) {
12031                         maybe_exact = FALSE;
12032
12033                         /* See if the character's fold differs between /d and
12034                          * /u.  This includes the multi-char fold SHARP S to
12035                          * 'ss' */
12036                         if (maybe_exactfu
12037                             && (PL_fold[ender] != PL_fold_latin1[ender]
12038                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12039                                 || (len > 0
12040                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12041                                    && isARG2_lower_or_UPPER_ARG1('s',
12042                                                                  *(s-1)))))
12043                         {
12044                             maybe_exactfu = FALSE;
12045                         }
12046                     }
12047
12048                     /* Even when folding, we store just the input character, as
12049                      * we have an array that finds its fold quickly */
12050                     *(s++) = (char) ender;
12051                 }
12052                 else {  /* FOLD and UTF */
12053                     /* Unlike the non-fold case, we do actually have to
12054                      * calculate the results here in pass 1.  This is for two
12055                      * reasons, the folded length may be longer than the
12056                      * unfolded, and we have to calculate how many EXACTish
12057                      * nodes it will take; and we may run out of room in a node
12058                      * in the middle of a potential multi-char fold, and have
12059                      * to back off accordingly.  (Hence we can't use REGC for
12060                      * the simple case just below.) */
12061
12062                     UV folded;
12063                     if (isASCII(ender)) {
12064                         folded = toFOLD(ender);
12065                         *(s)++ = (U8) folded;
12066                     }
12067                     else {
12068                         STRLEN foldlen;
12069
12070                         folded = _to_uni_fold_flags(
12071                                      ender,
12072                                      (U8 *) s,
12073                                      &foldlen,
12074                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12075                                                         ? FOLD_FLAGS_NOMIX_ASCII
12076                                                         : 0));
12077                         s += foldlen;
12078
12079                         /* The loop increments <len> each time, as all but this
12080                          * path (and one other) through it add a single byte to
12081                          * the EXACTish node.  But this one has changed len to
12082                          * be the correct final value, so subtract one to
12083                          * cancel out the increment that follows */
12084                         len += foldlen - 1;
12085                     }
12086                     /* If this node only contains non-folding code points so
12087                      * far, see if this new one is also non-folding */
12088                     if (maybe_exact) {
12089                         if (folded != ender) {
12090                             maybe_exact = FALSE;
12091                         }
12092                         else {
12093                             /* Here the fold is the original; we have to check
12094                              * further to see if anything folds to it */
12095                             if (_invlist_contains_cp(PL_utf8_foldable,
12096                                                         ender))
12097                             {
12098                                 maybe_exact = FALSE;
12099                             }
12100                         }
12101                     }
12102                     ender = folded;
12103                 }
12104
12105                 if (next_is_quantifier) {
12106
12107                     /* Here, the next input is a quantifier, and to get here,
12108                      * the current character is the only one in the node.
12109                      * Also, here <len> doesn't include the final byte for this
12110                      * character */
12111                     len++;
12112                     goto loopdone;
12113                 }
12114
12115             } /* End of loop through literal characters */
12116
12117             /* Here we have either exhausted the input or run out of room in
12118              * the node.  (If we encountered a character that can't be in the
12119              * node, transfer is made directly to <loopdone>, and so we
12120              * wouldn't have fallen off the end of the loop.)  In the latter
12121              * case, we artificially have to split the node into two, because
12122              * we just don't have enough space to hold everything.  This
12123              * creates a problem if the final character participates in a
12124              * multi-character fold in the non-final position, as a match that
12125              * should have occurred won't, due to the way nodes are matched,
12126              * and our artificial boundary.  So back off until we find a non-
12127              * problematic character -- one that isn't at the beginning or
12128              * middle of such a fold.  (Either it doesn't participate in any
12129              * folds, or appears only in the final position of all the folds it
12130              * does participate in.)  A better solution with far fewer false
12131              * positives, and that would fill the nodes more completely, would
12132              * be to actually have available all the multi-character folds to
12133              * test against, and to back-off only far enough to be sure that
12134              * this node isn't ending with a partial one.  <upper_parse> is set
12135              * further below (if we need to reparse the node) to include just
12136              * up through that final non-problematic character that this code
12137              * identifies, so when it is set to less than the full node, we can
12138              * skip the rest of this */
12139             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12140
12141                 const STRLEN full_len = len;
12142
12143                 assert(len >= MAX_NODE_STRING_SIZE);
12144
12145                 /* Here, <s> points to the final byte of the final character.
12146                  * Look backwards through the string until find a non-
12147                  * problematic character */
12148
12149                 if (! UTF) {
12150
12151                     /* This has no multi-char folds to non-UTF characters */
12152                     if (ASCII_FOLD_RESTRICTED) {
12153                         goto loopdone;
12154                     }
12155
12156                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12157                     len = s - s0 + 1;
12158                 }
12159                 else {
12160                     if (!  PL_NonL1NonFinalFold) {
12161                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12162                                         NonL1_Perl_Non_Final_Folds_invlist);
12163                     }
12164
12165                     /* Point to the first byte of the final character */
12166                     s = (char *) utf8_hop((U8 *) s, -1);
12167
12168                     while (s >= s0) {   /* Search backwards until find
12169                                            non-problematic char */
12170                         if (UTF8_IS_INVARIANT(*s)) {
12171
12172                             /* There are no ascii characters that participate
12173                              * in multi-char folds under /aa.  In EBCDIC, the
12174                              * non-ascii invariants are all control characters,
12175                              * so don't ever participate in any folds. */
12176                             if (ASCII_FOLD_RESTRICTED
12177                                 || ! IS_NON_FINAL_FOLD(*s))
12178                             {
12179                                 break;
12180                             }
12181                         }
12182                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12183                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12184                                                                   *s, *(s+1))))
12185                             {
12186                                 break;
12187                             }
12188                         }
12189                         else if (! _invlist_contains_cp(
12190                                         PL_NonL1NonFinalFold,
12191                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12192                         {
12193                             break;
12194                         }
12195
12196                         /* Here, the current character is problematic in that
12197                          * it does occur in the non-final position of some
12198                          * fold, so try the character before it, but have to
12199                          * special case the very first byte in the string, so
12200                          * we don't read outside the string */
12201                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12202                     } /* End of loop backwards through the string */
12203
12204                     /* If there were only problematic characters in the string,
12205                      * <s> will point to before s0, in which case the length
12206                      * should be 0, otherwise include the length of the
12207                      * non-problematic character just found */
12208                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12209                 }
12210
12211                 /* Here, have found the final character, if any, that is
12212                  * non-problematic as far as ending the node without splitting
12213                  * it across a potential multi-char fold.  <len> contains the
12214                  * number of bytes in the node up-to and including that
12215                  * character, or is 0 if there is no such character, meaning
12216                  * the whole node contains only problematic characters.  In
12217                  * this case, give up and just take the node as-is.  We can't
12218                  * do any better */
12219                 if (len == 0) {
12220                     len = full_len;
12221
12222                     /* If the node ends in an 's' we make sure it stays EXACTF,
12223                      * as if it turns into an EXACTFU, it could later get
12224                      * joined with another 's' that would then wrongly match
12225                      * the sharp s */
12226                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12227                     {
12228                         maybe_exactfu = FALSE;
12229                     }
12230                 } else {
12231
12232                     /* Here, the node does contain some characters that aren't
12233                      * problematic.  If one such is the final character in the
12234                      * node, we are done */
12235                     if (len == full_len) {
12236                         goto loopdone;
12237                     }
12238                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12239
12240                         /* If the final character is problematic, but the
12241                          * penultimate is not, back-off that last character to
12242                          * later start a new node with it */
12243                         p = oldp;
12244                         goto loopdone;
12245                     }
12246
12247                     /* Here, the final non-problematic character is earlier
12248                      * in the input than the penultimate character.  What we do
12249                      * is reparse from the beginning, going up only as far as
12250                      * this final ok one, thus guaranteeing that the node ends
12251                      * in an acceptable character.  The reason we reparse is
12252                      * that we know how far in the character is, but we don't
12253                      * know how to correlate its position with the input parse.
12254                      * An alternate implementation would be to build that
12255                      * correlation as we go along during the original parse,
12256                      * but that would entail extra work for every node, whereas
12257                      * this code gets executed only when the string is too
12258                      * large for the node, and the final two characters are
12259                      * problematic, an infrequent occurrence.  Yet another
12260                      * possible strategy would be to save the tail of the
12261                      * string, and the next time regatom is called, initialize
12262                      * with that.  The problem with this is that unless you
12263                      * back off one more character, you won't be guaranteed
12264                      * regatom will get called again, unless regbranch,
12265                      * regpiece ... are also changed.  If you do back off that
12266                      * extra character, so that there is input guaranteed to
12267                      * force calling regatom, you can't handle the case where
12268                      * just the first character in the node is acceptable.  I
12269                      * (khw) decided to try this method which doesn't have that
12270                      * pitfall; if performance issues are found, we can do a
12271                      * combination of the current approach plus that one */
12272                     upper_parse = len;
12273                     len = 0;
12274                     s = s0;
12275                     goto reparse;
12276                 }
12277             }   /* End of verifying node ends with an appropriate char */
12278
12279         loopdone:   /* Jumped to when encounters something that shouldn't be in
12280                        the node */
12281
12282             /* I (khw) don't know if you can get here with zero length, but the
12283              * old code handled this situation by creating a zero-length EXACT
12284              * node.  Might as well be NOTHING instead */
12285             if (len == 0) {
12286                 OP(ret) = NOTHING;
12287             }
12288             else {
12289                 if (FOLD) {
12290                     /* If 'maybe_exact' is still set here, means there are no
12291                      * code points in the node that participate in folds;
12292                      * similarly for 'maybe_exactfu' and code points that match
12293                      * differently depending on UTF8ness of the target string
12294                      * (for /u), or depending on locale for /l */
12295                     if (maybe_exact) {
12296                         OP(ret) = EXACT;
12297                     }
12298                     else if (maybe_exactfu) {
12299                         OP(ret) = EXACTFU;
12300                     }
12301                 }
12302                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12303                                            FALSE /* Don't look to see if could
12304                                                     be turned into an EXACT
12305                                                     node, as we have already
12306                                                     computed that */
12307                                           );
12308             }
12309
12310             RExC_parse = p - 1;
12311             Set_Node_Cur_Length(ret, parse_start);
12312             nextchar(pRExC_state);
12313             {
12314                 /* len is STRLEN which is unsigned, need to copy to signed */
12315                 IV iv = len;
12316                 if (iv < 0)
12317                     vFAIL("Internal disaster");
12318             }
12319
12320         } /* End of label 'defchar:' */
12321         break;
12322     } /* End of giant switch on input character */
12323
12324     return(ret);
12325 }
12326
12327 STATIC char *
12328 S_regwhite( RExC_state_t *pRExC_state, char *p )
12329 {
          /* Skips forward from 'p' over any mixture of isSPACE() characters and
           * '#' comments, never going beyond RExC_end, and returns a pointer to
           * the first character that is neither (or to wherever the scan had to
           * stop because the input ran out).  A comment extends from the '#'
           * through the next '\n', and that '\n' is consumed as well.  If the
           * input runs out before the '\n' is found, REG_RUN_ON_COMMENT_SEEN is
           * OR'ed into RExC_seen. */
12330     const char *e = RExC_end;
12331
12332     PERL_ARGS_ASSERT_REGWHITE;
12333
12334     while (p < e) {
12335         if (isSPACE(*p))
12336             ++p;
12337         else if (*p == '#') {
                  /* Becomes true once the newline ending this comment is seen */
12338             bool ended = 0;
12339             do {
                      /* The post-increment steps over every character of the
                       * comment, starting with the '#' itself and including the
                       * terminating newline */
12340                 if (*p++ == '\n') {
12341                     ended = 1;
12342                     break;
12343                 }
12344             } while (p < e);
                  /* Ran off the end of the input while still inside a comment */
12345             if (!ended)
12346                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12347         }
12348         else
12349             break;
12350     }
12351     return p;
12352 }
12353
12354 STATIC char *
12355 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12356 {
12357     /* Returns a pointer to the next non-pattern-white-space, non-comment
12358      * character (the latter only if 'recognize_comment' is true) in the string
12359      * p, which is ended by RExC_end.  If there is no line break ending a
12360      * comment, the REG_RUN_ON_COMMENT_SEEN flag is added to RExC_seen. */
12361     const char *e = RExC_end;
12362
12363     PERL_ARGS_ASSERT_REGPATWS;
12364
12365     while (p < e) {
12366         STRLEN len;
              /* A non-zero result is used as the number of bytes to step over */
12367         if ((len = is_PATWS_safe(p, e, UTF))) {
12368             p += len;
12369         }
12370         else if (recognize_comment && *p == '#') {
                  /* Becomes true once a line break ending this comment is seen */
12371             bool ended = 0;
12372             do {
                      /* Advance before testing, so the '#' itself is never
                       * examined.  When a line break is found, p is left
                       * pointing at it rather than past it.
                       * NOTE(review): presumably the outer loop then consumes
                       * the line break as pattern white space -- confirm */
12373                 p++;
12374                 if (is_LNBREAK_safe(p, e, UTF)) {
12375                     ended = 1;
12376                     break;
12377                 }
12378             } while (p < e);
                  /* Ran off the end of the input while still inside a comment */
12379             if (!ended)
12380                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12381         }
12382         else
12383             break;
12384     }
12385     return p;
12386 }
12387
12388 STATIC void
12389 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12390 {
12391     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12392      * sets up the bitmap and any flags, removing those code points from the
12393      * inversion list, setting it to NULL should it become completely empty.
           * The bitmap is always zeroed first, so when '*invlist_ptr' is NULL on
           * entry, 'node' is simply left with an empty bitmap. */
12394
12395     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12396     assert(PL_regkind[OP(node)] == ANYOF);
12397
12398     ANYOF_BITMAP_ZERO(node);
12399     if (*invlist_ptr) {
12400
12401         /* This gets set if we actually need to modify things */
12402         bool change_invlist = FALSE;
12403
12404         UV start, end;
12405
12406         /* Start looking through *invlist_ptr */
12407         invlist_iterinit(*invlist_ptr);
12408         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12409             UV high;
12410             int i;
12411
                  /* The flags are updated before the 'start > 255' test below,
                   * so the range that triggers the early exit still contributes
                   * to them.  A range that runs up to UV_MAX and begins at or
                   * before 256 contains every code point above 255; otherwise,
                   * any range reaching 256 or beyond contains at least some of
                   * them */
12412             if (end == UV_MAX && start <= 256) {
12413                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12414             }
12415             else if (end >= 256) {
12416                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12417             }
12418
12419             /* Quit if we are above what we should change.
                   * NOTE(review): this assumes the iteration yields ranges in
                   * ascending order, so no later range can start at or below
                   * 255 -- confirm */
12420             if (start > 255) {
12421                 break;
12422             }
12423
12424             change_invlist = TRUE;
12425
12426             /* Set all the bits in the range, up to the max that we are doing
                   * (the range is clamped so that nothing above 255 is put into
                   * the bitmap here) */
12427             high = (end < 255) ? end : 255;
12428             for (i = start; i <= (int) high; i++) {
12429                 if (! ANYOF_BITMAP_TEST(node, i)) {
12430                     ANYOF_BITMAP_SET(node, i);
12431                 }
12432             }
12433         }
12434         invlist_iterfinish(*invlist_ptr);
12435
12436         /* Done with loop; remove any code points that are in the bitmap from
12437          * *invlist_ptr; similarly for code points above latin1 if we have a
12438          * flag to match all of them anyways.
               * NOTE(review): both helpers are passed 'invlist_ptr' as their last
               * argument, presumably so the result replaces '*invlist_ptr' --
               * confirm against their definitions */
12439         if (change_invlist) {
12440             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12441         }
12442         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12443             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12444         }
12445
12446         /* If have completely emptied it, remove it completely */
12447         if (_invlist_len(*invlist_ptr) == 0) {
12448             SvREFCNT_dec_NN(*invlist_ptr);
12449             *invlist_ptr = NULL;
12450         }
12451     }
12452 }
12453
12454 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12455    Character classes ([:foo:]) can also be negated ([:^foo:]).
12456    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12457    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12458    but trigger failures because they are currently unimplemented. */
12459
12460 #define POSIXCC_DONE(c)   ((c) == ':')  /* [: :] is implemented */
12461 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') /* unimplemented forms */
12462 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) /* any of them */
12463
12464 PERL_STATIC_INLINE I32
12465 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12466 {
          /* Looks for a POSIX-style class ([:foo:], [=foo=] or [.foo.]) at the
           * current parse position.  Nothing is done unless 'value' is '[', the
           * character at RExC_parse is ':', '=' or '.', and at least one more
           * byte of input follows that character.
           *
           * Returns OOB_NAMEDCLASS when no class is recognized; otherwise the
           * ANYOF_xxx id of the named class, plus 1 when the name is preceded
           * by '^'.
           *
           * When a complete "[:name:]" is recognized, RExC_parse is left just
           * past the closing ']'.  In the grandfathered cases (no second
           * delimiter at all, or a second delimiter that is not followed by
           * ']'), RExC_parse is reset to the first delimiter; if 'strict' is
           * true these cases are reported through vFAIL*() instead.  An
           * unrecognized name inside [: :] is reported through vFAIL2utf8f()
           * regardless of 'strict'.
           * NOTE(review): the vFAIL* macros presumably do not return --
           * confirm */
12467     dVAR;
12468     I32 namedclass = OOB_NAMEDCLASS;
12469
12470     PERL_ARGS_ASSERT_REGPPOSIXCC;
12471
12472     if (value == '[' && RExC_parse + 1 < RExC_end &&
12473         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12474         POSIXCC(UCHARAT(RExC_parse)))
12475     {
              /* c is the opening delimiter (':', '=' or '.'); s remembers where
               * it is while RExC_parse moves on to look for its mate */
12476         const char c = UCHARAT(RExC_parse);
12477         char* const s = RExC_parse++;
12478
12479         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12480             RExC_parse++;
              /* Reached the end of the input without finding a second 'c' */
12481         if (RExC_parse == RExC_end) {
12482             if (strict) {
12483
12484                 /* Try to give a better location for the error (than the end of
12485                  * the string) by looking for the matching ']' */
12486                 RExC_parse = s;
12487                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12488                     RExC_parse++;
12489                 }
12490                 vFAIL2("Unmatched '%c' in POSIX class", c);
12491             }
12492             /* Grandfather lone [:, [=, [. */
12493             RExC_parse = s;
12494         }
12495         else {
                  /* t points at the second delimiter */
12496             const char* const t = RExC_parse++; /* skip over the c */
12497             assert(*t == c);
12498
                  /* NOTE(review): this reads the byte at RExC_parse without
                   * comparing against RExC_end; when the second delimiter is
                   * the last byte of the input, it relies on the following byte
                   * being readable (presumably a terminating NUL) -- confirm */
12499             if (UCHARAT(RExC_parse) == ']') {
                      /* First byte after the opening delimiter: the start of
                       * the name, or a '^' */
12500                 const char *posixcc = s + 1;
12501                 RExC_parse++; /* skip over the ending ] */
12502
12503                 if (*s == ':') {
                          /* 'complement' is non-zero (the '^' itself) if the
                           * class is negated, in which case posixcc is advanced
                           * to the start of the name.  'skip' is the length of
                           * the name, which ends just before t */
12504                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12505                     const I32 skip = t - posixcc;
12506
12507                     /* Initially switch on the length of the name.  */
12508                     switch (skip) {
12509                     case 4:
12510                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12511                                                           this is the Perl \w
12512                                                         */
12513                             namedclass = ANYOF_WORDCHAR;
12514                         break;
12515                     case 5:
12516                         /* Names all of length 5.  */
12517                         /* alnum alpha ascii blank cntrl digit graph lower
12518                            print punct space upper  */
12519                         /* Offset 4 gives the best switch position.  */
12520                         switch (posixcc[4]) {
12521                         case 'a':
12522                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12523                                 namedclass = ANYOF_ALPHA;
12524                             break;
12525                         case 'e':
12526                             if (memEQ(posixcc, "spac", 4)) /* space */
12527                                 namedclass = ANYOF_PSXSPC;
12528                             break;
12529                         case 'h':
12530                             if (memEQ(posixcc, "grap", 4)) /* graph */
12531                                 namedclass = ANYOF_GRAPH;
12532                             break;
12533                         case 'i':
12534                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12535                                 namedclass = ANYOF_ASCII;
12536                             break;
12537                         case 'k':
12538                             if (memEQ(posixcc, "blan", 4)) /* blank */
12539                                 namedclass = ANYOF_BLANK;
12540                             break;
12541                         case 'l':
12542                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12543                                 namedclass = ANYOF_CNTRL;
12544                             break;
12545                         case 'm':
12546                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12547                                 namedclass = ANYOF_ALPHANUMERIC;
12548                             break;
12549                         case 'r':
                                  /* When FOLD is in effect, both [:lower:] and
                                   * [:upper:] map to ANYOF_CASED */
12550                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12551                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12552                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12553                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12554                             break;
12555                         case 't':
12556                             if (memEQ(posixcc, "digi", 4)) /* digit */
12557                                 namedclass = ANYOF_DIGIT;
12558                             else if (memEQ(posixcc, "prin", 4)) /* print */
12559                                 namedclass = ANYOF_PRINT;
12560                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12561                                 namedclass = ANYOF_PUNCT;
12562                             break;
12563                         }
12564                         break;
12565                     case 6:
12566                         if (memEQ(posixcc, "xdigit", 6))
12567                             namedclass = ANYOF_XDIGIT;
12568                         break;
12569                     }
12570
                          /* No name matched.  The text reported is everything
                           * between the two delimiters, including any '^' */
12571                     if (namedclass == OOB_NAMEDCLASS)
12572                         vFAIL2utf8f(
12573                             "POSIX class [:%"UTF8f":] unknown",
12574                             UTF8fARG(UTF, t - s - 1, s + 1));
12575
12576                     /* The #defines are structured so each complement is +1 to
12577                      * the normal one */
12578                     if (complement) {
12579                         namedclass++;
12580                     }
                          /* Sanity checks: the name is followed by the closing
                           * delimiter and then the ']' */
12581                     assert (posixcc[skip] == ':');
12582                     assert (posixcc[skip+1] == ']');
12583                 } else if (!SIZE_ONLY) {
12584                     /* [[=foo=]] and [[.foo.]] are still future.  (This is
                           * reported only when not SIZE_ONLY.) */
12585
12586                     /* adjust RExC_parse so the warning shows after
12587                        the class closes.
                             NOTE(review): this loop stops at a NUL byte rather
                             than at RExC_end -- confirm the pattern buffer is
                             always NUL-terminated */
12588                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12589                         RExC_parse++;
12590                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12591                 }
12592             } else {
12593                 /* Maternal grandfather:
12594                  * "[:" ending in ":" but not in ":]" */
12595                 if (strict) {
12596                     vFAIL("Unmatched '[' in POSIX class");
12597                 }
12598
12599                 /* Grandfather lone [:, [=, [. */
12600                 RExC_parse = s;
12601             }
12602         }
12603     }
12604
12605     return namedclass;
12606 }
12607
12608 STATIC bool
12609 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12610 {
12611     /* This applies some heuristics at the current parse position (which should
12612      * be at a '[') to see if what follows might be intended to be a [:posix:]
12613      * class.  It returns true if it really is a posix class, of course, but it
12614      * also can return true if it thinks that what was intended was a posix
12615      * class that didn't quite make it.
12616      *
12617      * It will return true for
12618      *      [:alphanumerics:
12619      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12620      *                         ')' indicating the end of the (?[...]) construct)
12621      *      [:any garbage including %^&$ punctuation:]
12622      *
12623      * This is designed to be called only from S_handle_regex_sets; it could be
12624      * easily adapted to be called from the spot at the beginning of regclass()
12625      * that checks to see in a normal bracketed class if the surrounding []
12626      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12627      * change long-standing behavior, so I (khw) didn't do that.
           *
           * RExC_parse itself is not changed; only the local 'p' moves. */
12628     char* p = RExC_parse + 1;
          /* NOTE(review): *p is read here without first checking that p is
           * below RExC_end -- confirm the caller guarantees a readable byte
           * after the '[' */
12629     char first_char = *p;
12630
12631     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12632
12633     assert(*(p - 1) == '[');
12634
          /* The character after the '[' must be one of ':', '=' or '.' */
12635     if (! POSIXCC(first_char)) {
12636         return FALSE;
12637     }
12638
          /* Step over that delimiter, then over any run of word characters */
12639     p++;
12640     while (p < RExC_end && isWORDCHAR(*p)) p++;
12641
          /* The input ended with nothing after the word characters */
12642     if (p >= RExC_end) {
12643         return FALSE;
12644     }
12645
          /* The word characters are followed either by the same delimiter, or
           * by a ']' which itself is followed by at least one more character
           * that isn't ')' */
12646     if (p - RExC_parse > 2    /* Got at least 1 word character */
12647         && (*p == first_char
12648             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12649     {
12650         return TRUE;
12651     }
12652
          /* Otherwise fall back to finding the first ']' anywhere in the rest
           * of the input; this is what allows the name to contain non-word
           * characters.  It counts if that ']' is immediately preceded by the
           * same delimiter that followed the '[' */
12653     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12654
12655     return (p
12656             && p - RExC_parse > 2 /* [:] evaluates to colon;
12657                                       [::] is a bad posix class. */
12658             && first_char == *(p - 1));
12659 }
12660
12661 STATIC regnode *
12662 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12663                     I32 *flagp, U32 depth,
12664                     char * const oregcomp_parse)
12665 {
12666     /* Handle the (?[...]) construct to do set operations */
12667
12668     U8 curchar;
12669     UV start, end;      /* End points of code point ranges */
12670     SV* result_string;
12671     char *save_end, *save_parse;
12672     SV* final;
12673     STRLEN len;
12674     regnode* node;
12675     AV* stack;
12676     const bool save_fold = FOLD;
12677
12678     GET_RE_DEBUG_FLAGS_DECL;
12679
12680     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12681
12682     if (LOC) {
12683         vFAIL("(?[...]) not valid in locale");
12684     }
12685     RExC_uni_semantics = 1;
12686
12687     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12688      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12689      * call regclass to handle '[]' so as to not have to reinvent its parsing
12690      * rules here (throwing away the size it computes each time).  And, we exit
12691      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12692      * these things, we need to realize that something preceded by a backslash
12693      * is escaped, so we have to keep track of backslashes */
12694     if (SIZE_ONLY) {
12695         UV depth = 0; /* how many nested (?[...]) constructs */
12696
12697         Perl_ck_warner_d(aTHX_
12698             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12699             "The regex_sets feature is experimental" REPORT_LOCATION,
12700                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12701                 UTF8fARG(UTF,
12702                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12703                          RExC_precomp + (RExC_parse - RExC_precomp)));
12704
12705         while (RExC_parse < RExC_end) {
12706             SV* current = NULL;
12707             RExC_parse = regpatws(pRExC_state, RExC_parse,
12708                                 TRUE); /* means recognize comments */
12709             switch (*RExC_parse) {
12710                 case '?':
12711                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12712                     /* FALL THROUGH */
12713                 default:
12714                     break;
12715                 case '\\':
12716                     /* Skip the next byte (which could cause us to end up in
12717                      * the middle of a UTF-8 character, but since none of those
12718                      * are confusable with anything we currently handle in this
12719                      * switch (invariants all), it's safe.  We'll just hit the
12720                      * default: case next time and keep on incrementing until
12721                      * we find one of the invariants we do handle. */
12722                     RExC_parse++;
12723                     break;
12724                 case '[':
12725                 {
12726                     /* If this looks like it is a [:posix:] class, leave the
12727                      * parse pointer at the '[' to fool regclass() into
12728                      * thinking it is part of a '[[:posix:]]'.  That function
12729                      * will use strict checking to force a syntax error if it
12730                      * doesn't work out to a legitimate class */
12731                     bool is_posix_class
12732                                     = could_it_be_a_POSIX_class(pRExC_state);
12733                     if (! is_posix_class) {
12734                         RExC_parse++;
12735                     }
12736
12737                     /* regclass() can only return RESTART_UTF8 if multi-char
12738                        folds are allowed.  */
12739                     if (!regclass(pRExC_state, flagp,depth+1,
12740                                   is_posix_class, /* parse the whole char
12741                                                      class only if not a
12742                                                      posix class */
12743                                   FALSE, /* don't allow multi-char folds */
12744                                   TRUE, /* silence non-portable warnings. */
12745                                   &current))
12746                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12747                               (UV) *flagp);
12748
12749                     /* function call leaves parse pointing to the ']', except
12750                      * if we faked it */
12751                     if (is_posix_class) {
12752                         RExC_parse--;
12753                     }
12754
12755                     SvREFCNT_dec(current);   /* In case it returned something */
12756                     break;
12757                 }
12758
12759                 case ']':
12760                     if (depth--) break;
12761                     RExC_parse++;
12762                     if (RExC_parse < RExC_end
12763                         && *RExC_parse == ')')
12764                     {
12765                         node = reganode(pRExC_state, ANYOF, 0);
12766                         RExC_size += ANYOF_SKIP;
12767                         nextchar(pRExC_state);
12768                         Set_Node_Length(node,
12769                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12770                         return node;
12771                     }
12772                     goto no_close;
12773             }
12774             RExC_parse++;
12775         }
12776
12777         no_close:
12778         FAIL("Syntax error in (?[...])");
12779     }
12780
12781     /* Pass 2 only after this.  Everything in this construct is a
12782      * metacharacter.  Operands begin with either a '\' (for an escape
12783      * sequence), or a '[' for a bracketed character class.  Any other
12784      * character should be an operator, or parenthesis for grouping.  Both
12785      * types of operands are handled by calling regclass() to parse them.  It
12786      * is called with a parameter to indicate to return the computed inversion
12787      * list.  The parsing here is implemented via a stack.  Each entry on the
12788      * stack is a single character representing one of the operators, or the
12789      * '('; or else a pointer to an operand inversion list. */
12790
12791 #define IS_OPERAND(a)  (! SvIOK(a))
12792
12793     /* The stack starts empty.  It is a syntax error if the first thing parsed
12794      * is a binary operator; everything else is pushed on the stack.  When an
12795      * operand is parsed, the top of the stack is examined.  If it is a binary
12796      * operator, the item before it should be an operand, and both are replaced
12797      * by the result of doing that operation on the new operand and the one on
12798      * the stack.   Thus a sequence of binary operands is reduced to a single
12799      * one before the next one is parsed.
12800      *
12801      * A unary operator may immediately follow a binary in the input, for
12802      * example
12803      *      [a] + ! [b]
12804      * When an operand is parsed and the top of the stack is a unary operator,
12805      * the operation is performed, and then the stack is rechecked to see if
12806      * this new operand is part of a binary operation; if so, it is handled as
12807      * above.
12808      *
12809      * A '(' is simply pushed on the stack; it is valid only if the stack is
12810      * empty, or the top element of the stack is an operator or another '('
12811      * (for which the parenthesized expression will become an operand).  By the
12812      * time the corresponding ')' is parsed everything in between should have
12813      * been parsed and evaluated to a single operand (or else is a syntax
12814      * error), and is handled as a regular operand */
12815
12816     sv_2mortal((SV *)(stack = newAV()));
12817
12818     while (RExC_parse < RExC_end) {
12819         I32 top_index = av_tindex(stack);
12820         SV** top_ptr;
12821         SV* current = NULL;
12822
12823         /* Skip white space */
12824         RExC_parse = regpatws(pRExC_state, RExC_parse,
12825                                 TRUE); /* means recognize comments */
12826         if (RExC_parse >= RExC_end) {
12827             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12828         }
12829         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12830             break;
12831         }
12832
12833         switch (curchar) {
12834
12835             case '?':
12836                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12837                                                safely subtract 1 from
12838                                                RExC_parse in the next clause.
12839                                                If we have something on the
12840                                                stack, we have parsed something
12841                                              */
12842                     && UCHARAT(RExC_parse - 1) == '('
12843                     && RExC_parse < RExC_end)
12844                 {
12845                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12846                      * This happens when we have some thing like
12847                      *
12848                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12849                      *   ...
12850                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12851                      *
12852                      * Here we would be handling the interpolated
12853                      * '$thai_or_lao'.  We handle this by a recursive call to
12854                      * ourselves which returns the inversion list the
12855                      * interpolated expression evaluates to.  We use the flags
12856                      * from the interpolated pattern. */
12857                     U32 save_flags = RExC_flags;
12858                     const char * const save_parse = ++RExC_parse;
12859
12860                     parse_lparen_question_flags(pRExC_state);
12861
12862                     if (RExC_parse == save_parse  /* Makes sure there was at
12863                                                      least one flag (or this
12864                                                      embedding wasn't compiled)
12865                                                    */
12866                         || RExC_parse >= RExC_end - 4
12867                         || UCHARAT(RExC_parse) != ':'
12868                         || UCHARAT(++RExC_parse) != '('
12869                         || UCHARAT(++RExC_parse) != '?'
12870                         || UCHARAT(++RExC_parse) != '[')
12871                     {
12872
12873                         /* In combination with the above, this moves the
12874                          * pointer to the point just after the first erroneous
12875                          * character (or if there are no flags, to where they
12876                          * should have been) */
12877                         if (RExC_parse >= RExC_end - 4) {
12878                             RExC_parse = RExC_end;
12879                         }
12880                         else if (RExC_parse != save_parse) {
12881                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12882                         }
12883                         vFAIL("Expecting '(?flags:(?[...'");
12884                     }
12885                     RExC_parse++;
12886                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12887                                                     depth+1, oregcomp_parse);
12888
12889                     /* Here, 'current' contains the embedded expression's
12890                      * inversion list, and RExC_parse points to the trailing
12891                      * ']'; the next character should be the ')' which will be
12892                      * paired with the '(' that has been put on the stack, so
12893                      * the whole embedded expression reduces to '(operand)' */
12894                     RExC_parse++;
12895
12896                     RExC_flags = save_flags;
12897                     goto handle_operand;
12898                 }
12899                 /* FALL THROUGH */
12900
12901             default:
12902                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12903                 vFAIL("Unexpected character");
12904
12905             case '\\':
12906                 /* regclass() can only return RESTART_UTF8 if multi-char
12907                    folds are allowed.  */
12908                 if (!regclass(pRExC_state, flagp,depth+1,
12909                               TRUE, /* means parse just the next thing */
12910                               FALSE, /* don't allow multi-char folds */
12911                               FALSE, /* don't silence non-portable warnings.  */
12912                               &current))
12913                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12914                           (UV) *flagp);
12915                 /* regclass() will return with parsing just the \ sequence,
12916                  * leaving the parse pointer at the next thing to parse */
12917                 RExC_parse--;
12918                 goto handle_operand;
12919
12920             case '[':   /* Is a bracketed character class */
12921             {
12922                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12923
12924                 if (! is_posix_class) {
12925                     RExC_parse++;
12926                 }
12927
12928                 /* regclass() can only return RESTART_UTF8 if multi-char
12929                    folds are allowed.  */
12930                 if(!regclass(pRExC_state, flagp,depth+1,
12931                              is_posix_class, /* parse the whole char class
12932                                                 only if not a posix class */
12933                              FALSE, /* don't allow multi-char folds */
12934                              FALSE, /* don't silence non-portable warnings.  */
12935                              &current))
12936                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12937                           (UV) *flagp);
12938                 /* function call leaves parse pointing to the ']', except if we
12939                  * faked it */
12940                 if (is_posix_class) {
12941                     RExC_parse--;
12942                 }
12943
12944                 goto handle_operand;
12945             }
12946
12947             case '&':
12948             case '|':
12949             case '+':
12950             case '-':
12951             case '^':
12952                 if (top_index < 0
12953                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12954                     || ! IS_OPERAND(*top_ptr))
12955                 {
12956                     RExC_parse++;
12957                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12958                 }
12959                 av_push(stack, newSVuv(curchar));
12960                 break;
12961
12962             case '!':
12963                 av_push(stack, newSVuv(curchar));
12964                 break;
12965
12966             case '(':
12967                 if (top_index >= 0) {
12968                     top_ptr = av_fetch(stack, top_index, FALSE);
12969                     assert(top_ptr);
12970                     if (IS_OPERAND(*top_ptr)) {
12971                         RExC_parse++;
12972                         vFAIL("Unexpected '(' with no preceding operator");
12973                     }
12974                 }
12975                 av_push(stack, newSVuv(curchar));
12976                 break;
12977
12978             case ')':
12979             {
12980                 SV* lparen;
12981                 if (top_index < 1
12982                     || ! (current = av_pop(stack))
12983                     || ! IS_OPERAND(current)
12984                     || ! (lparen = av_pop(stack))
12985                     || IS_OPERAND(lparen)
12986                     || SvUV(lparen) != '(')
12987                 {
12988                     SvREFCNT_dec(current);
12989                     RExC_parse++;
12990                     vFAIL("Unexpected ')'");
12991                 }
12992                 top_index -= 2;
12993                 SvREFCNT_dec_NN(lparen);
12994
12995                 /* FALL THROUGH */
12996             }
12997
12998               handle_operand:
12999
13000                 /* Here, we have an operand to process, in 'current' */
13001
13002                 if (top_index < 0) {    /* Just push if stack is empty */
13003                     av_push(stack, current);
13004                 }
13005                 else {
13006                     SV* top = av_pop(stack);
13007                     SV *prev = NULL;
13008                     char current_operator;
13009
13010                     if (IS_OPERAND(top)) {
13011                         SvREFCNT_dec_NN(top);
13012                         SvREFCNT_dec_NN(current);
13013                         vFAIL("Operand with no preceding operator");
13014                     }
13015                     current_operator = (char) SvUV(top);
13016                     switch (current_operator) {
13017                         case '(':   /* Push the '(' back on followed by the new
13018                                        operand */
13019                             av_push(stack, top);
13020                             av_push(stack, current);
13021                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13022                                                    just after the 'break', so
13023                                                    it doesn't get wrongly freed
13024                                                  */
13025                             break;
13026
13027                         case '!':
13028                             _invlist_invert(current);
13029
13030                             /* Unlike binary operators, the top of the stack,
13031                              * now that this unary one has been popped off, may
13032                              * legally be an operator, and we now have operand
13033                              * for it. */
13034                             top_index--;
13035                             SvREFCNT_dec_NN(top);
13036                             goto handle_operand;
13037
13038                         case '&':
13039                             prev = av_pop(stack);
13040                             _invlist_intersection(prev,
13041                                                    current,
13042                                                    &current);
13043                             av_push(stack, current);
13044                             break;
13045
13046                         case '|':
13047                         case '+':
13048                             prev = av_pop(stack);
13049                             _invlist_union(prev, current, &current);
13050                             av_push(stack, current);
13051                             break;
13052
13053                         case '-':
13054                             prev = av_pop(stack);;
13055                             _invlist_subtract(prev, current, &current);
13056                             av_push(stack, current);
13057                             break;
13058
13059                         case '^':   /* The union minus the intersection */
13060                         {
13061                             SV* i = NULL;
13062                             SV* u = NULL;
13063                             SV* element;
13064
13065                             prev = av_pop(stack);
13066                             _invlist_union(prev, current, &u);
13067                             _invlist_intersection(prev, current, &i);
13068                             /* _invlist_subtract will overwrite current
13069                                 without freeing what it already contains */
13070                             element = current;
13071                             _invlist_subtract(u, i, &current);
13072                             av_push(stack, current);
13073                             SvREFCNT_dec_NN(i);
13074                             SvREFCNT_dec_NN(u);
13075                             SvREFCNT_dec_NN(element);
13076                             break;
13077                         }
13078
13079                         default:
13080                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13081                 }
13082                 SvREFCNT_dec_NN(top);
13083                 SvREFCNT_dec(prev);
13084             }
13085         }
13086
13087         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13088     }
13089
13090     if (av_tindex(stack) < 0   /* Was empty */
13091         || ((final = av_pop(stack)) == NULL)
13092         || ! IS_OPERAND(final)
13093         || av_tindex(stack) >= 0)  /* More left on stack */
13094     {
13095         vFAIL("Incomplete expression within '(?[ ])'");
13096     }
13097
13098     /* Here, 'final' is the resultant inversion list from evaluating the
13099      * expression.  Return it if so requested */
13100     if (return_invlist) {
13101         *return_invlist = final;
13102         return END;
13103     }
13104
13105     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13106      * expecting a string of ranges and individual code points */
13107     invlist_iterinit(final);
13108     result_string = newSVpvs("");
13109     while (invlist_iternext(final, &start, &end)) {
13110         if (start == end) {
13111             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13112         }
13113         else {
13114             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13115                                                      start,          end);
13116         }
13117     }
13118
13119     save_parse = RExC_parse;
13120     RExC_parse = SvPV(result_string, len);
13121     save_end = RExC_end;
13122     RExC_end = RExC_parse + len;
13123
13124     /* We turn off folding around the call, as the class we have constructed
13125      * already has all folding taken into consideration, and we don't want
13126      * regclass() to add to that */
13127     RExC_flags &= ~RXf_PMf_FOLD;
13128     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13129      */
13130     node = regclass(pRExC_state, flagp,depth+1,
13131                     FALSE, /* means parse the whole char class */
13132                     FALSE, /* don't allow multi-char folds */
13133                     TRUE, /* silence non-portable warnings.  The above may very
13134                              well have generated non-portable code points, but
13135                              they're valid on this machine */
13136                     NULL);
13137     if (!node)
13138         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13139                     PTR2UV(flagp));
13140     if (save_fold) {
13141         RExC_flags |= RXf_PMf_FOLD;
13142     }
13143     RExC_parse = save_parse + 1;
13144     RExC_end = save_end;
13145     SvREFCNT_dec_NN(final);
13146     SvREFCNT_dec_NN(result_string);
13147
13148     nextchar(pRExC_state);
13149     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13150     return node;
13151 }
13152 #undef IS_OPERAND
13153
13154 /* The names of properties whose definitions are not known at compile time are
13155  * stored in the SV 'listsv', after a constant heading.  So if its length has
13156  * changed from 'initial_listsv_len', then there is a run-time definition.
       * Both 'listsv' and 'initial_listsv_len' must be in scope wherever this
       * macro is used. */
13157 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13158                                         (SvCUR(listsv) != initial_listsv_len)
13159
13160 STATIC regnode *
13161 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13162                  const bool stop_at_1,  /* Just parse the next thing, don't
13163                                            look for a full character class */
13164                  bool allow_multi_folds,
13165                  const bool silence_non_portable,   /* Don't output warnings
13166                                                        about too large
13167                                                        characters */
13168                  SV** ret_invlist)  /* Return an inversion list, not a node */
13169 {
13170     /* parse a bracketed class specification.  Most of these will produce an
13171      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13172      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13173      * under /i with multi-character folds: it will be rewritten following the
13174      * paradigm of this example, where the <multi-fold>s are characters which
13175      * fold to multiple character sequences:
13176      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13177      * gets effectively rewritten as:
13178      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13179      * reg() gets called (recursively) on the rewritten version, and this
13180      * function will return what it constructs.  (Actually the <multi-fold>s
13181      * aren't physically removed from the [abcdefghi], it's just that they are
13182      * ignored in the recursion by means of a flag:
13183      * <RExC_in_multi_char_class>.)
13184      *
13185      * ANYOF nodes contain a bit map for the first 256 characters, with the
13186      * corresponding bit set if that character is in the list.  For characters
13187      * above 255, a range list or swash is used.  There are extra bits for \w,
13188      * etc. in locale ANYOFs, as what these match is not determinable at
13189      * compile time
13190      *
13191      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13192      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13193      */
13194
13195     dVAR;
13196     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13197     IV range = 0;
13198     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13199     regnode *ret;
13200     STRLEN numlen;
13201     IV namedclass = OOB_NAMEDCLASS;
13202     char *rangebegin = NULL;
13203     bool need_class = 0;
13204     SV *listsv = NULL;
13205     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13206                                       than just initialized.  */
13207     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13208     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13209                                extended beyond the Latin1 range.  These have to
13210                                be kept separate from other code points for much
13211                                of this function because their handling  is
13212                                different under /i, and for most classes under
13213                                /d as well */
13214     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13215                                separate for a while from the non-complemented
13216                                versions because of complications with /d
13217                                matching */
13218     UV element_count = 0;   /* Number of distinct elements in the class.
13219                                Optimizations may be possible if this is tiny */
13220     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13221                                        character; used under /i */
13222     UV n;
13223     char * stop_ptr = RExC_end;    /* where to stop parsing */
13224     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13225                                                    space? */
13226     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13227
13228     /* Unicode properties are stored in a swash; this holds the current one
13229      * being parsed.  If this swash is the only above-latin1 component of the
13230      * character class, an optimization is to pass it directly on to the
13231      * execution engine.  Otherwise, it is set to NULL to indicate that there
13232      * are other things in the class that have to be dealt with at execution
13233      * time */
13234     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13235
13236     /* Set if a component of this character class is user-defined; just passed
13237      * on to the engine */
13238     bool has_user_defined_property = FALSE;
13239
13240     /* inversion list of code points this node matches only when the target
13241      * string is in UTF-8.  (Because is under /d) */
13242     SV* depends_list = NULL;
13243
13244     /* Inversion list of code points this node matches regardless of things
13245      * like locale, folding, utf8ness of the target string */
13246     SV* cp_list = NULL;
13247
13248     /* Like cp_list, but code points on this list need to be checked for things
13249      * that fold to/from them under /i */
13250     SV* cp_foldable_list = NULL;
13251
13252     /* Like cp_list, but code points on this list are valid only when the
13253      * runtime locale is UTF-8 */
13254     SV* only_utf8_locale_list = NULL;
13255
13256 #ifdef EBCDIC
13257     /* In a range, counts how many 0-2 of the ends of it came from literals,
13258      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13259     UV literal_endpoint = 0;
13260 #endif
13261     bool invert = FALSE;    /* Is this class to be complemented */
13262
13263     bool warn_super = ALWAYS_WARN_SUPER;
13264
13265     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13266         case we need to change the emitted regop to an EXACT. */
13267     const char * orig_parse = RExC_parse;
13268     const SSize_t orig_size = RExC_size;
13269     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13270     GET_RE_DEBUG_FLAGS_DECL;
13271
13272     PERL_ARGS_ASSERT_REGCLASS;
13273 #ifndef DEBUGGING
13274     PERL_UNUSED_ARG(depth);
13275 #endif
13276
13277     DEBUG_PARSE("clas");
13278
13279     /* Assume we are going to generate an ANYOF node. */
13280     ret = reganode(pRExC_state, ANYOF, 0);
13281
13282     if (SIZE_ONLY) {
13283         RExC_size += ANYOF_SKIP;
13284         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13285     }
13286     else {
13287         ANYOF_FLAGS(ret) = 0;
13288
13289         RExC_emit += ANYOF_SKIP;
13290         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13291         initial_listsv_len = SvCUR(listsv);
13292         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13293     }
13294
13295     if (skip_white) {
13296         RExC_parse = regpatws(pRExC_state, RExC_parse,
13297                               FALSE /* means don't recognize comments */);
13298     }
13299
13300     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13301         RExC_parse++;
13302         invert = TRUE;
13303         allow_multi_folds = FALSE;
13304         RExC_naughty++;
13305         if (skip_white) {
13306             RExC_parse = regpatws(pRExC_state, RExC_parse,
13307                                   FALSE /* means don't recognize comments */);
13308         }
13309     }
13310
13311     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13312     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13313         const char *s = RExC_parse;
13314         const char  c = *s++;
13315
13316         while (isWORDCHAR(*s))
13317             s++;
13318         if (*s && c == *s && s[1] == ']') {
13319             SAVEFREESV(RExC_rx_sv);
13320             ckWARN3reg(s+2,
13321                        "POSIX syntax [%c %c] belongs inside character classes",
13322                        c, c);
13323             (void)ReREFCNT_inc(RExC_rx_sv);
13324         }
13325     }
13326
13327     /* If the caller wants us to just parse a single element, accomplish this
13328      * by faking the loop ending condition */
13329     if (stop_at_1 && RExC_end > RExC_parse) {
13330         stop_ptr = RExC_parse + 1;
13331     }
13332
13333     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13334     if (UCHARAT(RExC_parse) == ']')
13335         goto charclassloop;
13336
13337 parseit:
13338     while (1) {
13339         if  (RExC_parse >= stop_ptr) {
13340             break;
13341         }
13342
13343         if (skip_white) {
13344             RExC_parse = regpatws(pRExC_state, RExC_parse,
13345                                   FALSE /* means don't recognize comments */);
13346         }
13347
13348         if  (UCHARAT(RExC_parse) == ']') {
13349             break;
13350         }
13351
13352     charclassloop:
13353
13354         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13355         save_value = value;
13356         save_prevvalue = prevvalue;
13357
13358         if (!range) {
13359             rangebegin = RExC_parse;
13360             element_count++;
13361         }
13362         if (UTF) {
13363             value = utf8n_to_uvchr((U8*)RExC_parse,
13364                                    RExC_end - RExC_parse,
13365                                    &numlen, UTF8_ALLOW_DEFAULT);
13366             RExC_parse += numlen;
13367         }
13368         else
13369             value = UCHARAT(RExC_parse++);
13370
13371         if (value == '['
13372             && RExC_parse < RExC_end
13373             && POSIXCC(UCHARAT(RExC_parse)))
13374         {
13375             namedclass = regpposixcc(pRExC_state, value, strict);
13376         }
13377         else if (value == '\\') {
13378             if (UTF) {
13379                 value = utf8n_to_uvchr((U8*)RExC_parse,
13380                                    RExC_end - RExC_parse,
13381                                    &numlen, UTF8_ALLOW_DEFAULT);
13382                 RExC_parse += numlen;
13383             }
13384             else
13385                 value = UCHARAT(RExC_parse++);
13386
13387             /* Some compilers cannot handle switching on 64-bit integer
13388              * values, therefore value cannot be an UV.  Yes, this will
13389              * be a problem later if we want to switch on Unicode.
13390              * A similar issue a little bit later when switching on
13391              * namedclass. --jhi */
13392
13393             /* If the \ is escaping white space when white space is being
13394              * skipped, it means that that white space is wanted literally, and
13395              * is already in 'value'.  Otherwise, need to translate the escape
13396              * into what it signifies. */
13397             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13398
13399             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13400             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13401             case 's':   namedclass = ANYOF_SPACE;       break;
13402             case 'S':   namedclass = ANYOF_NSPACE;      break;
13403             case 'd':   namedclass = ANYOF_DIGIT;       break;
13404             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13405             case 'v':   namedclass = ANYOF_VERTWS;      break;
13406             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13407             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13408             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13409             case 'N':  /* Handle \N{NAME} in class */
13410                 {
13411                     /* We only pay attention to the first char of
13412                     multichar strings being returned. I kinda wonder
13413                     if this makes sense as it does change the behaviour
13414                     from earlier versions, OTOH that behaviour was broken
13415                     as well. */
13416                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13417                                       TRUE, /* => charclass */
13418                                       strict))
13419                     {
13420                         if (*flagp & RESTART_UTF8)
13421                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13422                         goto parseit;
13423                     }
13424                 }
13425                 break;
13426             case 'p':
13427             case 'P':
13428                 {
13429                 char *e;
13430
13431                 /* We will handle any undefined properties ourselves */
13432                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13433                                        /* And we actually would prefer to get
13434                                         * the straight inversion list of the
13435                                         * swash, since we will be accessing it
13436                                         * anyway, to save a little time */
13437                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13438
13439                 if (RExC_parse >= RExC_end)
13440                     vFAIL2("Empty \\%c{}", (U8)value);
13441                 if (*RExC_parse == '{') {
13442                     const U8 c = (U8)value;
13443                     e = strchr(RExC_parse++, '}');
13444                     if (!e)
13445                         vFAIL2("Missing right brace on \\%c{}", c);
13446                     while (isSPACE(UCHARAT(RExC_parse)))
13447                         RExC_parse++;
13448                     if (e == RExC_parse)
13449                         vFAIL2("Empty \\%c{}", c);
13450                     n = e - RExC_parse;
13451                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13452                         n--;
13453                 }
13454                 else {
13455                     e = RExC_parse;
13456                     n = 1;
13457                 }
13458                 if (!SIZE_ONLY) {
13459                     SV* invlist;
13460                     char* formatted;
13461                     char* name;
13462
13463                     if (UCHARAT(RExC_parse) == '^') {
13464                          RExC_parse++;
13465                          n--;
13466                          /* toggle.  (The rhs xor gets the single bit that
13467                           * differs between P and p; the other xor inverts just
13468                           * that bit) */
13469                          value ^= 'P' ^ 'p';
13470
13471                          while (isSPACE(UCHARAT(RExC_parse))) {
13472                               RExC_parse++;
13473                               n--;
13474                          }
13475                     }
13476                     /* Try to get the definition of the property into
13477                      * <invlist>.  If /i is in effect, the effective property
13478                      * will have its name be <__NAME_i>.  The design is
13479                      * discussed in commit
13480                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13481                     formatted = Perl_form(aTHX_
13482                                           "%s%.*s%s\n",
13483                                           (FOLD) ? "__" : "",
13484                                           (int)n,
13485                                           RExC_parse,
13486                                           (FOLD) ? "_i" : ""
13487                                 );
13488                     name = savepvn(formatted, strlen(formatted));
13489
13490                     /* Look up the property name, and get its swash and
13491                      * inversion list, if the property is found  */
13492                     if (swash) {
13493                         SvREFCNT_dec_NN(swash);
13494                     }
13495                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13496                                              1, /* binary */
13497                                              0, /* not tr/// */
13498                                              NULL, /* No inversion list */
13499                                              &swash_init_flags
13500                                             );
13501                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13502                         if (swash) {
13503                             SvREFCNT_dec_NN(swash);
13504                             swash = NULL;
13505                         }
13506
13507                         /* Here didn't find it.  It could be a user-defined
13508                          * property that will be available at run-time.  If we
13509                          * accept only compile-time properties, this is an error;
13510                          * otherwise add it to the list for run-time look up */
13511                         if (ret_invlist) {
13512                             RExC_parse = e + 1;
13513                             vFAIL2utf8f(
13514                                 "Property '%"UTF8f"' is unknown",
13515                                 UTF8fARG(UTF, n, name));
13516                         }
13517                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13518                                         (value == 'p' ? '+' : '!'),
13519                                         UTF8fARG(UTF, n, name));
13520                         has_user_defined_property = TRUE;
13521
13522                         /* We don't know yet, so have to assume that the
13523                          * property could match something in the Latin1 range,
13524                          * hence something that isn't utf8.  Note that this
13525                          * would cause things in <depends_list> to match
13526                          * inappropriately, except that any \p{}, including
13527                          * this one forces Unicode semantics, which means there
13528                          * is no <depends_list> */
13529                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13530                     }
13531                     else {
13532
13533                         /* Here, did get the swash and its inversion list.  If
13534                          * the swash is from a user-defined property, then this
13535                          * whole character class should be regarded as such */
13536                         if (swash_init_flags
13537                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13538                         {
13539                             has_user_defined_property = TRUE;
13540                         }
13541                         else if
13542                             /* We warn on matching an above-Unicode code point
13543                              * if the match would return true, except don't
13544                              * warn for \p{All}, which has exactly one element
13545                              * = 0 */
13546                             (_invlist_contains_cp(invlist, 0x110000)
13547                                 && (! (_invlist_len(invlist) == 1
13548                                        && *invlist_array(invlist) == 0)))
13549                         {
13550                             warn_super = TRUE;
13551                         }
13552
13553
13554                         /* Invert if asking for the complement */
13555                         if (value == 'P') {
13556                             _invlist_union_complement_2nd(properties,
13557                                                           invlist,
13558                                                           &properties);
13559
13560                             /* The swash can't be used as-is, because we've
13561                              * inverted things; delay removing it to here after
13562                              * have copied its invlist above */
13563                             SvREFCNT_dec_NN(swash);
13564                             swash = NULL;
13565                         }
13566                         else {
13567                             _invlist_union(properties, invlist, &properties);
13568                         }
13569                     }
13570                     Safefree(name);
13571                 }
13572                 RExC_parse = e + 1;
13573                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13574                                                 named */
13575
13576                 /* \p means they want Unicode semantics */
13577                 RExC_uni_semantics = 1;
13578                 }
13579                 break;
13580             case 'n':   value = '\n';                   break;
13581             case 'r':   value = '\r';                   break;
13582             case 't':   value = '\t';                   break;
13583             case 'f':   value = '\f';                   break;
13584             case 'b':   value = '\b';                   break;
13585             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13586             case 'a':   value = '\a';                   break;
13587             case 'o':
13588                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13589                 {
13590                     const char* error_msg;
13591                     bool valid = grok_bslash_o(&RExC_parse,
13592                                                &value,
13593                                                &error_msg,
13594                                                SIZE_ONLY,   /* warnings in pass
13595                                                                1 only */
13596                                                strict,
13597                                                silence_non_portable,
13598                                                UTF);
13599                     if (! valid) {
13600                         vFAIL(error_msg);
13601                     }
13602                 }
13603                 if (PL_encoding && value < 0x100) {
13604                     goto recode_encoding;
13605                 }
13606                 break;
13607             case 'x':
13608                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13609                 {
13610                     const char* error_msg;
13611                     bool valid = grok_bslash_x(&RExC_parse,
13612                                                &value,
13613                                                &error_msg,
13614                                                TRUE, /* Output warnings */
13615                                                strict,
13616                                                silence_non_portable,
13617                                                UTF);
13618                     if (! valid) {
13619                         vFAIL(error_msg);
13620                     }
13621                 }
13622                 if (PL_encoding && value < 0x100)
13623                     goto recode_encoding;
13624                 break;
13625             case 'c':
13626                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13627                 break;
13628             case '0': case '1': case '2': case '3': case '4':
13629             case '5': case '6': case '7':
13630                 {
13631                     /* Take 1-3 octal digits */
13632                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13633                     numlen = (strict) ? 4 : 3;
13634                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13635                     RExC_parse += numlen;
13636                     if (numlen != 3) {
13637                         if (strict) {
13638                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13639                             vFAIL("Need exactly 3 octal digits");
13640                         }
13641                         else if (! SIZE_ONLY /* like \08, \178 */
13642                                  && numlen < 3
13643                                  && RExC_parse < RExC_end
13644                                  && isDIGIT(*RExC_parse)
13645                                  && ckWARN(WARN_REGEXP))
13646                         {
13647                             SAVEFREESV(RExC_rx_sv);
13648                             reg_warn_non_literal_string(
13649                                  RExC_parse + 1,
13650                                  form_short_octal_warning(RExC_parse, numlen));
13651                             (void)ReREFCNT_inc(RExC_rx_sv);
13652                         }
13653                     }
13654                     if (PL_encoding && value < 0x100)
13655                         goto recode_encoding;
13656                     break;
13657                 }
13658             recode_encoding:
13659                 if (! RExC_override_recoding) {
13660                     SV* enc = PL_encoding;
13661                     value = reg_recode((const char)(U8)value, &enc);
13662                     if (!enc) {
13663                         if (strict) {
13664                             vFAIL("Invalid escape in the specified encoding");
13665                         }
13666                         else if (SIZE_ONLY) {
13667                             ckWARNreg(RExC_parse,
13668                                   "Invalid escape in the specified encoding");
13669                         }
13670                     }
13671                     break;
13672                 }
13673             default:
13674                 /* Allow \_ to not give an error */
13675                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13676                     if (strict) {
13677                         vFAIL2("Unrecognized escape \\%c in character class",
13678                                (int)value);
13679                     }
13680                     else {
13681                         SAVEFREESV(RExC_rx_sv);
13682                         ckWARN2reg(RExC_parse,
13683                             "Unrecognized escape \\%c in character class passed through",
13684                             (int)value);
13685                         (void)ReREFCNT_inc(RExC_rx_sv);
13686                     }
13687                 }
13688                 break;
13689             }   /* End of switch on char following backslash */
13690         } /* end of handling backslash escape sequences */
13691 #ifdef EBCDIC
13692         else
13693             literal_endpoint++;
13694 #endif
13695
13696         /* Here, we have the current token in 'value' */
13697
13698         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13699             U8 classnum;
13700
13701             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13702              * literal, as is the character that began the false range, i.e.
13703              * the 'a' in the examples */
13704             if (range) {
13705                 if (!SIZE_ONLY) {
13706                     const int w = (RExC_parse >= rangebegin)
13707                                   ? RExC_parse - rangebegin
13708                                   : 0;
13709                     if (strict) {
13710                         vFAIL2utf8f(
13711                             "False [] range \"%"UTF8f"\"",
13712                             UTF8fARG(UTF, w, rangebegin));
13713                     }
13714                     else {
13715                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13716                         ckWARN2reg(RExC_parse,
13717                             "False [] range \"%"UTF8f"\"",
13718                             UTF8fARG(UTF, w, rangebegin));
13719                         (void)ReREFCNT_inc(RExC_rx_sv);
13720                         cp_list = add_cp_to_invlist(cp_list, '-');
13721                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13722                                                              prevvalue);
13723                     }
13724                 }
13725
13726                 range = 0; /* this was not a true range */
13727                 element_count += 2; /* So counts for three values */
13728             }
13729
13730             classnum = namedclass_to_classnum(namedclass);
13731
13732             if (LOC && namedclass < ANYOF_POSIXL_MAX
13733 #ifndef HAS_ISASCII
13734                 && classnum != _CC_ASCII
13735 #endif
13736             ) {
13737                 /* What the Posix classes (like \w, [:space:]) match in locale
13738                  * isn't knowable under locale until actual match time.  Room
13739                  * must be reserved (one time per outer bracketed class) to
13740                  * store such classes.  The space will contain a bit for each
13741                  * named class that is to be matched against.  This isn't
13742                  * needed for \p{} and pseudo-classes, as they are not affected
13743                  * by locale, and hence are dealt with separately */
13744                 if (! need_class) {
13745                     need_class = 1;
13746                     if (SIZE_ONLY) {
13747                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13748                     }
13749                     else {
13750                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13751                     }
13752                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13753                     ANYOF_POSIXL_ZERO(ret);
13754                 }
13755
13756                 /* See if it already matches the complement of this POSIX
13757                  * class */
13758                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13759                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13760                                                             ? -1
13761                                                             : 1)))
13762                 {
13763                     posixl_matches_all = TRUE;
13764                     break;  /* No need to continue.  Since it matches both
13765                                e.g., \w and \W, it matches everything, and the
13766                                bracketed class can be optimized into qr/./s */
13767                 }
13768
13769                 /* Add this class to those that should be checked at runtime */
13770                 ANYOF_POSIXL_SET(ret, namedclass);
13771
13772                 /* The above-Latin1 characters are not subject to locale rules.
13773                  * Just add them, in the second pass, to the
13774                  * unconditionally-matched list */
13775                 if (! SIZE_ONLY) {
13776                     SV* scratch_list = NULL;
13777
13778                     /* Get the list of the above-Latin1 code points this
13779                      * matches */
13780                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13781                                           PL_XPosix_ptrs[classnum],
13782
13783                                           /* Odd numbers are complements, like
13784                                            * NDIGIT, NASCII, ... */
13785                                           namedclass % 2 != 0,
13786                                           &scratch_list);
13787                     /* Checking if 'cp_list' is NULL first saves an extra
13788                      * clone.  Its reference count will be decremented at the
13789                      * next union, etc, or if this is the only instance, at the
13790                      * end of the routine */
13791                     if (! cp_list) {
13792                         cp_list = scratch_list;
13793                     }
13794                     else {
13795                         _invlist_union(cp_list, scratch_list, &cp_list);
13796                         SvREFCNT_dec_NN(scratch_list);
13797                     }
13798                     continue;   /* Go get next character */
13799                 }
13800             }
13801             else if (! SIZE_ONLY) {
13802
13803                 /* Here, not in pass1 (in that pass we skip calculating the
13804                  * contents of this class), and isn't /l, or is a POSIX class for
13805                  * which /l doesn't matter (or is a Unicode property, which is
13806                  * skipped here). */
13807                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13808                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13809
13810                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13811                          * nor /l make a difference in what these match,
13812                          * therefore we just add what they match to cp_list. */
13813                         if (classnum != _CC_VERTSPACE) {
13814                             assert(   namedclass == ANYOF_HORIZWS
13815                                    || namedclass == ANYOF_NHORIZWS);
13816
13817                             /* It turns out that \h is just a synonym for
13818                              * XPosixBlank */
13819                             classnum = _CC_BLANK;
13820                         }
13821
13822                         _invlist_union_maybe_complement_2nd(
13823                                 cp_list,
13824                                 PL_XPosix_ptrs[classnum],
13825                                 namedclass % 2 != 0,    /* Complement if odd
13826                                                           (NHORIZWS, NVERTWS)
13827                                                         */
13828                                 &cp_list);
13829                     }
13830                 }
13831                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13832                            complement and use nposixes */
13833                     SV** posixes_ptr = namedclass % 2 == 0
13834                                        ? &posixes
13835                                        : &nposixes;
13836                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13837                     _invlist_union_maybe_complement_2nd(
13838                                                      *posixes_ptr,
13839                                                      *source_ptr,
13840                                                      namedclass % 2 != 0,
13841                                                      posixes_ptr);
13842                 }
13843                 continue;   /* Go get next character */
13844             }
13845         } /* end of namedclass \blah */
13846
13847         /* Here, we have a single value.  If 'range' is set, it is the ending
13848          * of a range--check its validity.  Later, we will handle each
13849          * individual code point in the range.  If 'range' isn't set, this
13850          * could be the beginning of a range, so check for that by looking
13851          * ahead to see if the next real character to be processed is the range
13852          * indicator--the minus sign */
13853
13854         if (skip_white) {
13855             RExC_parse = regpatws(pRExC_state, RExC_parse,
13856                                 FALSE /* means don't recognize comments */);
13857         }
13858
13859         if (range) {
13860             if (prevvalue > value) /* b-a */ {
13861                 const int w = RExC_parse - rangebegin;
13862                 vFAIL2utf8f(
13863                     "Invalid [] range \"%"UTF8f"\"",
13864                     UTF8fARG(UTF, w, rangebegin));
13865                 range = 0; /* not a valid range */
13866             }
13867         }
13868         else {
13869             prevvalue = value; /* save the beginning of the potential range */
13870             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13871                 && *RExC_parse == '-')
13872             {
13873                 char* next_char_ptr = RExC_parse + 1;
13874                 if (skip_white) {   /* Get the next real char after the '-' */
13875                     next_char_ptr = regpatws(pRExC_state,
13876                                              RExC_parse + 1,
13877                                              FALSE); /* means don't recognize
13878                                                         comments */
13879                 }
13880
13881                 /* If the '-' is at the end of the class (just before the ']',
13882                  * it is a literal minus; otherwise it is a range */
13883                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13884                     RExC_parse = next_char_ptr;
13885
13886                     /* a bad range like \w-, [:word:]- ? */
13887                     if (namedclass > OOB_NAMEDCLASS) {
13888                         if (strict || ckWARN(WARN_REGEXP)) {
13889                             const int w =
13890                                 RExC_parse >= rangebegin ?
13891                                 RExC_parse - rangebegin : 0;
13892                             if (strict) {
13893                                 vFAIL4("False [] range \"%*.*s\"",
13894                                     w, w, rangebegin);
13895                             }
13896                             else {
13897                                 vWARN4(RExC_parse,
13898                                     "False [] range \"%*.*s\"",
13899                                     w, w, rangebegin);
13900                             }
13901                         }
13902                         if (!SIZE_ONLY) {
13903                             cp_list = add_cp_to_invlist(cp_list, '-');
13904                         }
13905                         element_count++;
13906                     } else
13907                         range = 1;      /* yeah, it's a range! */
13908                     continue;   /* but do it the next time */
13909                 }
13910             }
13911         }
13912
13913         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13914          * if not */
13915
13916         /* non-Latin1 code point implies unicode semantics.  Must be set in
13917          * pass1 so is there for the whole of pass 2 */
13918         if (value > 255) {
13919             RExC_uni_semantics = 1;
13920         }
13921
13922         /* Ready to process either the single value, or the completed range.
13923          * For single-valued non-inverted ranges, we consider the possibility
13924          * of multi-char folds.  (We made a conscious decision to not do this
13925          * for the other cases because it can often lead to non-intuitive
13926          * results.  For example, you have the peculiar case that:
13927          *  "s s" =~ /^[^\xDF]+$/i => Y
13928          *  "ss"  =~ /^[^\xDF]+$/i => N
13929          *
13930          * See [perl #89750] */
13931         if (FOLD && allow_multi_folds && value == prevvalue) {
13932             if (value == LATIN_SMALL_LETTER_SHARP_S
13933                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13934                                                         value)))
13935             {
13936                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13937
13938                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13939                 STRLEN foldlen;
13940
13941                 UV folded = _to_uni_fold_flags(
13942                                 value,
13943                                 foldbuf,
13944                                 &foldlen,
13945                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13946                                                    ? FOLD_FLAGS_NOMIX_ASCII
13947                                                    : 0)
13948                                 );
13949
13950                 /* Here, <folded> should be the first character of the
13951                  * multi-char fold of <value>, with <foldbuf> containing the
13952                  * whole thing.  But, if this fold is not allowed (because of
13953                  * the flags), <folded> will be the same as <value>, and should
13954                  * be processed like any other character, so skip the special
13955                  * handling */
13956                 if (folded != value) {
13957
13958                     /* Skip if we are recursed, currently parsing the class
13959                      * again.  Otherwise add this character to the list of
13960                      * multi-char folds. */
13961                     if (! RExC_in_multi_char_class) {
13962                         AV** this_array_ptr;
13963                         AV* this_array;
13964                         STRLEN cp_count = utf8_length(foldbuf,
13965                                                       foldbuf + foldlen);
13966                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13967
13968                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13969
13970
13971                         if (! multi_char_matches) {
13972                             multi_char_matches = newAV();
13973                         }
13974
13975                         /* <multi_char_matches> is actually an array of arrays.
13976                          * There will be one or two top-level elements: [2],
13977                          * and/or [3].  The [2] element is an array, each
13978                          * element thereof is a character which folds to TWO
13979                          * characters; [3] is for folds to THREE characters.
13980                          * (Unicode guarantees a maximum of 3 characters in any
13981                          * fold.)  When we rewrite the character class below,
13982                          * we will do so such that the longest folds are
13983                          * written first, so that it prefers the longest
13984                          * matching strings first.  This is done even if it
13985                          * turns out that any quantifier is non-greedy, out of
13986                          * programmer laziness.  Tom Christiansen has agreed
13987                          * that this is ok.  This makes the test for the
13988                          * ligature 'ffi' come before the test for 'ff' */
13989                         if (av_exists(multi_char_matches, cp_count)) {
13990                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13991                                                              cp_count, FALSE);
13992                             this_array = *this_array_ptr;
13993                         }
13994                         else {
13995                             this_array = newAV();
13996                             av_store(multi_char_matches, cp_count,
13997                                      (SV*) this_array);
13998                         }
13999                         av_push(this_array, multi_fold);
14000                     }
14001
14002                     /* This element should not be processed further in this
14003                      * class */
14004                     element_count--;
14005                     value = save_value;
14006                     prevvalue = save_prevvalue;
14007                     continue;
14008                 }
14009             }
14010         }
14011
14012         /* Deal with this element of the class */
14013         if (! SIZE_ONLY) {
14014 #ifndef EBCDIC
14015             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14016                                                      prevvalue, value);
14017 #else
14018             SV* this_range = _new_invlist(1);
14019             _append_range_to_invlist(this_range, prevvalue, value);
14020
14021             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14022              * If this range was specified using something like 'i-j', we want
14023              * to include only the 'i' and the 'j', and not anything in
14024              * between, so exclude non-ASCII, non-alphabetics from it.
14025              * However, if the range was specified with something like
14026              * [\x89-\x91] or [\x89-j], all code points within it should be
14027              * included.  literal_endpoint==2 means both ends of the range used
14028              * a literal character, not \x{foo} */
14029             if (literal_endpoint == 2
14030                 && ((prevvalue >= 'a' && value <= 'z')
14031                     || (prevvalue >= 'A' && value <= 'Z')))
14032             {
14033                 _invlist_intersection(this_range, PL_ASCII,
14034                                       &this_range);
14035
14036                 /* Since the above contains only ASCII, the intersection of it
14037                  * with anything will still yield only ASCII */
14038                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14039                                       &this_range);
14040             }
14041             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14042             literal_endpoint = 0;
14043 #endif
14044         }
14045
14046         range = 0; /* this range (if it was one) is done now */
14047     } /* End of loop through all the text within the brackets */
14048
14049     /* If anything in the class expands to more than one character, we have to
14050      * deal with them by building up a substitute parse string, and recursively
14051      * calling reg() on it, instead of proceeding */
14052     if (multi_char_matches) {
14053         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14054         I32 cp_count;
14055         STRLEN len;
14056         char *save_end = RExC_end;
14057         char *save_parse = RExC_parse;
14058         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14059                                        a "|" */
14060         I32 reg_flags;
14061
14062         assert(! invert);
14063 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14064            because too confusing */
14065         if (invert) {
14066             sv_catpv(substitute_parse, "(?:");
14067         }
14068 #endif
14069
14070         /* Look at the longest folds first */
14071         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14072
14073             if (av_exists(multi_char_matches, cp_count)) {
14074                 AV** this_array_ptr;
14075                 SV* this_sequence;
14076
14077                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14078                                                  cp_count, FALSE);
14079                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14080                                                                 &PL_sv_undef)
14081                 {
14082                     if (! first_time) {
14083                         sv_catpv(substitute_parse, "|");
14084                     }
14085                     first_time = FALSE;
14086
14087                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14088                 }
14089             }
14090         }
14091
14092         /* If the character class contains anything else besides these
14093          * multi-character folds, have to include it in recursive parsing */
14094         if (element_count) {
14095             sv_catpv(substitute_parse, "|[");
14096             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14097             sv_catpv(substitute_parse, "]");
14098         }
14099
14100         sv_catpv(substitute_parse, ")");
14101 #if 0
14102         if (invert) {
14103             /* This is a way to get the parse to skip forward a whole named
14104              * sequence instead of matching the 2nd character when it fails the
14105              * first */
14106             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14107         }
14108 #endif
14109
14110         RExC_parse = SvPV(substitute_parse, len);
14111         RExC_end = RExC_parse + len;
14112         RExC_in_multi_char_class = 1;
14113         RExC_emit = (regnode *)orig_emit;
14114
14115         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14116
14117         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14118
14119         RExC_parse = save_parse;
14120         RExC_end = save_end;
14121         RExC_in_multi_char_class = 0;
14122         SvREFCNT_dec_NN(multi_char_matches);
14123         return ret;
14124     }
14125
14126     /* Here, we've gone through the entire class and dealt with multi-char
14127      * folds.  We are now in a position that we can do some checks to see if we
14128      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14129      * Currently we only do two checks:
14130      * 1) in the unlikely event that the user has specified both, e.g., \w and
14131      *    \W under /l, the class matches everything.  (This optimization is
14132      *    done only so that the optimizer code that runs later works.)
14133      * 2) if the character class contains only a single element (including a
14134      *    single range), we see if there is an equivalent node for it.
14135      * Other checks are possible */
14136     if (! ret_invlist   /* Can't optimize if returning the constructed
14137                            inversion list */
14138         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14139     {
14140         U8 op = END;
14141         U8 arg = 0;
14142
14143         if (UNLIKELY(posixl_matches_all)) {
14144             op = SANY;
14145         }
14146         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14147                                                    \w or [:digit:] or \p{foo}
14148                                                  */
14149
14150             /* All named classes are mapped into POSIXish nodes, with its FLAG
14151              * argument giving which class it is */
14152             switch ((I32)namedclass) {
14153                 case ANYOF_UNIPROP:
14154                     break;
14155
14156                 /* These don't depend on the charset modifiers.  They always
14157                  * match under /u rules */
14158                 case ANYOF_NHORIZWS:
14159                 case ANYOF_HORIZWS:
14160                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14161                     /* FALLTHROUGH */
14162
14163                 case ANYOF_NVERTWS:
14164                 case ANYOF_VERTWS:
14165                     op = POSIXU;
14166                     goto join_posix;
14167
14168                 /* The actual POSIXish node for all the rest depends on the
14169                  * charset modifier.  The ones in the first set depend only on
14170                  * ASCII or, if available on this platform, locale */
14171                 case ANYOF_ASCII:
14172                 case ANYOF_NASCII:
14173 #ifdef HAS_ISASCII
14174                     op = (LOC) ? POSIXL : POSIXA;
14175 #else
14176                     op = POSIXA;
14177 #endif
14178                     goto join_posix;
14179
14180                 case ANYOF_NCASED:
14181                 case ANYOF_LOWER:
14182                 case ANYOF_NLOWER:
14183                 case ANYOF_UPPER:
14184                 case ANYOF_NUPPER:
14185                     /* under /a could be alpha */
14186                     if (FOLD) {
14187                         if (ASCII_RESTRICTED) {
14188                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14189                         }
14190                         else if (! LOC) {
14191                             break;
14192                         }
14193                     }
14194                     /* FALLTHROUGH */
14195
14196                 /* The rest have more possibilities depending on the charset.
14197                  * We take advantage of the enum ordering of the charset
14198                  * modifiers to get the exact node type, */
14199                 default:
14200                     op = POSIXD + get_regex_charset(RExC_flags);
14201                     if (op > POSIXA) { /* /aa is same as /a */
14202                         op = POSIXA;
14203                     }
14204
14205                 join_posix:
14206                     /* The odd numbered ones are the complements of the
14207                      * next-lower even number one */
14208                     if (namedclass % 2 == 1) {
14209                         invert = ! invert;
14210                         namedclass--;
14211                     }
14212                     arg = namedclass_to_classnum(namedclass);
14213                     break;
14214             }
14215         }
14216         else if (value == prevvalue) {
14217
14218             /* Here, the class consists of just a single code point */
14219
14220             if (invert) {
14221                 if (! LOC && value == '\n') {
14222                     op = REG_ANY; /* Optimize [^\n] */
14223                     *flagp |= HASWIDTH|SIMPLE;
14224                     RExC_naughty++;
14225                 }
14226             }
14227             else if (value < 256 || UTF) {
14228
14229                 /* Optimize a single value into an EXACTish node, but not if it
14230                  * would require converting the pattern to UTF-8. */
14231                 op = compute_EXACTish(pRExC_state);
14232             }
14233         } /* Otherwise is a range */
14234         else if (! LOC) {   /* locale could vary these */
14235             if (prevvalue == '0') {
14236                 if (value == '9') {
14237                     arg = _CC_DIGIT;
14238                     op = POSIXA;
14239                 }
14240             }
14241         }
14242
14243         /* Here, we have changed <op> away from its initial value iff we found
14244          * an optimization */
14245         if (op != END) {
14246
14247             /* Throw away this ANYOF regnode, and emit the calculated one,
14248              * which should correspond to the beginning, not current, state of
14249              * the parse */
14250             const char * cur_parse = RExC_parse;
14251             RExC_parse = (char *)orig_parse;
14252             if ( SIZE_ONLY) {
14253                 if (! LOC) {
14254
14255                     /* To get locale nodes to not use the full ANYOF size would
14256                      * require moving the code above that writes the portions
14257                      * of it that aren't in other nodes to after this point.
14258                      * e.g.  ANYOF_POSIXL_SET */
14259                     RExC_size = orig_size;
14260                 }
14261             }
14262             else {
14263                 RExC_emit = (regnode *)orig_emit;
14264                 if (PL_regkind[op] == POSIXD) {
14265                     if (op == POSIXL) {
14266                         RExC_contains_locale = 1;
14267                     }
14268                     if (invert) {
14269                         op += NPOSIXD - POSIXD;
14270                     }
14271                 }
14272             }
14273
14274             ret = reg_node(pRExC_state, op);
14275
14276             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14277                 if (! SIZE_ONLY) {
14278                     FLAGS(ret) = arg;
14279                 }
14280                 *flagp |= HASWIDTH|SIMPLE;
14281             }
14282             else if (PL_regkind[op] == EXACT) {
14283                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14284                                            TRUE /* downgradable to EXACT */
14285                                            );
14286             }
14287
14288             RExC_parse = (char *) cur_parse;
14289
14290             SvREFCNT_dec(posixes);
14291             SvREFCNT_dec(nposixes);
14292             SvREFCNT_dec(cp_list);
14293             SvREFCNT_dec(cp_foldable_list);
14294             return ret;
14295         }
14296     }
14297
14298     if (SIZE_ONLY)
14299         return ret;
14300     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14301
14302     /* If folding, we calculate all characters that could fold to or from the
14303      * ones already on the list */
14304     if (cp_foldable_list) {
14305         if (FOLD) {
14306             UV start, end;      /* End points of code point ranges */
14307
14308             SV* fold_intersection = NULL;
14309             SV** use_list;
14310
14311             /* Our calculated list will be for Unicode rules.  For locale
14312              * matching, we have to keep a separate list that is consulted at
14313              * runtime only when the locale indicates Unicode rules.  For
14314              * non-locale, we just use the general list */
14315             if (LOC) {
14316                 use_list = &only_utf8_locale_list;
14317             }
14318             else {
14319                 use_list = &cp_list;
14320             }
14321
14322             /* Only the characters in this class that participate in folds need
14323              * be checked.  Get the intersection of this class and all the
14324              * possible characters that are foldable.  This can quickly narrow
14325              * down a large class */
14326             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14327                                   &fold_intersection);
14328
14329             /* The folds for all the Latin1 characters are hard-coded into this
14330              * program, but we have to go out to disk to get the others. */
14331             if (invlist_highest(cp_foldable_list) >= 256) {
14332
14333                 /* This is a hash that for a particular fold gives all
14334                  * characters that are involved in it */
14335                 if (! PL_utf8_foldclosures) {
14336
14337                     /* If the folds haven't been read in, call a fold function
14338                      * to force that */
14339                     if (! PL_utf8_tofold) {
14340                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14341
14342                         /* This string is just a short named one above \xff */
14343                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14344                         assert(PL_utf8_tofold); /* Verify that worked */
14345                     }
14346                     PL_utf8_foldclosures
14347                                       = _swash_inversion_hash(PL_utf8_tofold);
14348                 }
14349             }
14350
14351             /* Now look at the foldable characters in this class individually */
14352             invlist_iterinit(fold_intersection);
14353             while (invlist_iternext(fold_intersection, &start, &end)) {
14354                 UV j;
14355
14356                 /* Look at every character in the range */
14357                 for (j = start; j <= end; j++) {
14358                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14359                     STRLEN foldlen;
14360                     SV** listp;
14361
14362                     if (j < 256) {
14363
14364                         /* We have the latin1 folding rules hard-coded here so
14365                          * that an innocent-looking character class, like
14366                          * /[ks]/i won't have to go out to disk to find the
14367                          * possible matches.  XXX It would be better to
14368                          * generate these via regen, in case a new version of
14369                          * the Unicode standard adds new mappings, though that
14370                          * is not really likely, and may be caught by the
14371                          * default: case of the switch below. */
14372
14373                         if (IS_IN_SOME_FOLD_L1(j)) {
14374
14375                             /* ASCII is always matched; non-ASCII is matched
14376                              * only under Unicode rules (which could happen
14377                              * under /l if the locale is a UTF-8 one) */
14378                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14379                                 *use_list = add_cp_to_invlist(*use_list,
14380                                                             PL_fold_latin1[j]);
14381                             }
14382                             else {
14383                                 depends_list =
14384                                  add_cp_to_invlist(depends_list,
14385                                                    PL_fold_latin1[j]);
14386                             }
14387                         }
14388
14389                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14390                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14391                         {
14392                             /* Certain Latin1 characters have matches outside
14393                             * Latin1.  To get here, <j> is one of those
14394                             * characters.   None of these matches is valid for
14395                             * ASCII characters under /aa, which is why the 'if'
14396                             * just above excludes those.  These matches only
14397                             * happen when the target string is utf8.  The code
14398                             * below adds the single fold closures for <j> to the
14399                             * inversion list. */
14400
14401                             switch (j) {
14402                                 case 'k':
14403                                 case 'K':
14404                                   *use_list =
14405                                      add_cp_to_invlist(*use_list, KELVIN_SIGN);
14406                                     break;
14407                                 case 's':
14408                                 case 'S':
14409                                   *use_list = add_cp_to_invlist(*use_list,
14410                                                     LATIN_SMALL_LETTER_LONG_S);
14411                                     break;
14412                                 case MICRO_SIGN:
14413                                   *use_list = add_cp_to_invlist(*use_list,
14414                                                       GREEK_CAPITAL_LETTER_MU);
14415                                   *use_list = add_cp_to_invlist(*use_list,
14416                                                         GREEK_SMALL_LETTER_MU);
14417                                     break;
14418                                 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14419                                 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14420                                   *use_list =
14421                                    add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14422                                     break;
14423                                 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14424                                   *use_list = add_cp_to_invlist(*use_list,
14425                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14426                                     break;
14427                                 case LATIN_SMALL_LETTER_SHARP_S:
14428                                   *use_list = add_cp_to_invlist(*use_list,
14429                                                  LATIN_CAPITAL_LETTER_SHARP_S);
14430                                     break;
14431                                 case 'F': case 'f':
14432                                 case 'I': case 'i':
14433                                 case 'L': case 'l':
14434                                 case 'T': case 't':
14435                                 case 'A': case 'a':
14436                                 case 'H': case 'h':
14437                                 case 'J': case 'j':
14438                                 case 'N': case 'n':
14439                                 case 'W': case 'w':
14440                                 case 'Y': case 'y':
14441                                     /* These all are targets of multi-character
14442                                      * folds from code points that require UTF8
14443                                      * to express, so they can't match unless
14444                                      * the target string is in UTF-8, so no
14445                                      * action here is necessary, as regexec.c
14446                                      * properly handles the general case for
14447                                      * UTF-8 matching and multi-char folds */
14448                                     break;
14449                                 default:
14450                                     /* Use deprecated warning to increase the
14451                                     * chances of this being output */
14452                                     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14453                                     break;
14454                             }
14455                         }
14456                         continue;
14457                     }
14458
14459                     /* Here is an above Latin1 character.  We don't have the
14460                      * rules hard-coded for it.  First, get its fold.  This is
14461                      * the simple fold, as the multi-character folds have been
14462                      * handled earlier and separated out */
14463                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14464                                                         (ASCII_FOLD_RESTRICTED)
14465                                                         ? FOLD_FLAGS_NOMIX_ASCII
14466                                                         : 0);
14467
14468                     /* Single character fold of above Latin1.  Add everything in
14469                     * its fold closure to the list that this node should match.
14470                     * The fold closures data structure is a hash with the keys
14471                     * being the UTF-8 of every character that is folded to, like
14472                     * 'k', and the values each an array of all code points that
14473                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14474                     * Multi-character folds are not included */
14475                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14476                                         (char *) foldbuf, foldlen, FALSE)))
14477                     {
14478                         AV* list = (AV*) *listp;
14479                         IV k;
14480                         for (k = 0; k <= av_tindex(list); k++) {
14481                             SV** c_p = av_fetch(list, k, FALSE);
14482                             UV c;
14483                             if (c_p == NULL) {
14484                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14485                             }
14486                             c = SvUV(*c_p);
14487
14488                             /* /aa doesn't allow folds between ASCII and non- */
14489                             if ((ASCII_FOLD_RESTRICTED
14490                                 && (isASCII(c) != isASCII(j))))
14491                             {
14492                                 continue;
14493                             }
14494
14495                             /* Folds under /l which cross the 255/256 boundary
14496                              * are added to a separate list.  (These are valid
14497                              * only when the locale is UTF-8.) */
14498                             if (c < 256 && LOC) {
14499                                 *use_list = add_cp_to_invlist(*use_list, c);
14500                                 continue;
14501                             }
14502
14503                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14504                             {
14505                                 cp_list = add_cp_to_invlist(cp_list, c);
14506                             }
14507                             else {
14508                                 /* Similarly folds involving non-ascii Latin1
14509                                 * characters under /d are added to their list */
14510                                 depends_list = add_cp_to_invlist(depends_list,
14511                                                                  c);
14512                             }
14513                         }
14514                     }
14515                 }
14516             }
14517             SvREFCNT_dec_NN(fold_intersection);
14518         }
14519
14520         /* Now that we have finished adding all the folds, there is no reason
14521          * to keep the foldable list separate */
14522         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14523         SvREFCNT_dec_NN(cp_foldable_list);
14524     }
14525
14526     /* And combine the result (if any) with any inversion list from posix
14527      * classes.  The lists are kept separate up to now because we don't want to
14528      * fold the classes (folding of those is automatically handled by the swash
14529      * fetching code) */
14530     if (posixes || nposixes) {
14531         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14532             /* Under /a and /aa, nothing above ASCII matches these */
14533             _invlist_intersection(posixes,
14534                                   PL_XPosix_ptrs[_CC_ASCII],
14535                                   &posixes);
14536         }
14537         if (nposixes) {
14538             if (DEPENDS_SEMANTICS) {
14539                 /* Under /d, everything in the upper half of the Latin1 range
14540                  * matches these complements */
14541                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14542             }
14543             else if (AT_LEAST_ASCII_RESTRICTED) {
14544                 /* Under /a and /aa, everything above ASCII matches these
14545                  * complements */
14546                 _invlist_union_complement_2nd(nposixes,
14547                                               PL_XPosix_ptrs[_CC_ASCII],
14548                                               &nposixes);
14549             }
14550             if (posixes) {
14551                 _invlist_union(posixes, nposixes, &posixes);
14552                 SvREFCNT_dec_NN(nposixes);
14553             }
14554             else {
14555                 posixes = nposixes;
14556             }
14557         }
14558         if (! DEPENDS_SEMANTICS) {
14559             if (cp_list) {
14560                 _invlist_union(cp_list, posixes, &cp_list);
14561                 SvREFCNT_dec_NN(posixes);
14562             }
14563             else {
14564                 cp_list = posixes;
14565             }
14566         }
14567         else {
14568             /* Under /d, we put into a separate list the Latin1 things that
14569              * match only when the target string is utf8 */
14570             SV* nonascii_but_latin1_properties = NULL;
14571             _invlist_intersection(posixes, PL_UpperLatin1,
14572                                   &nonascii_but_latin1_properties);
14573             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14574                               &posixes);
14575             if (cp_list) {
14576                 _invlist_union(cp_list, posixes, &cp_list);
14577                 SvREFCNT_dec_NN(posixes);
14578             }
14579             else {
14580                 cp_list = posixes;
14581             }
14582
14583             if (depends_list) {
14584                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14585                                &depends_list);
14586                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14587             }
14588             else {
14589                 depends_list = nonascii_but_latin1_properties;
14590             }
14591         }
14592     }
14593
14594     /* And combine the result (if any) with any inversion list from properties.
14595      * The lists are kept separate up to now so that we can distinguish the two
14596      * in regards to matching above-Unicode.  A run-time warning is generated
14597      * if a Unicode property is matched against a non-Unicode code point. But,
14598      * we allow user-defined properties to match anything, without any warning,
14599      * and we also suppress the warning if there is a portion of the character
14600      * class that isn't a Unicode property, and which matches above Unicode, \W
14601      * or [\x{110000}] for example.
14602      * (Note that in this case, unlike the Posix one above, there is no
14603      * <depends_list>, because having a Unicode property forces Unicode
14604      * semantics */
14605     if (properties) {
14606         if (cp_list) {
14607
14608             /* If it matters to the final outcome, see if a non-property
14609              * component of the class matches above Unicode.  If so, the
14610              * warning gets suppressed.  This is true even if just a single
14611              * such code point is specified, as though not strictly correct if
14612              * another such code point is matched against, the fact that they
14613              * are using above-Unicode code points indicates they should know
14614              * the issues involved */
14615             if (warn_super) {
14616                 warn_super = ! (invert
14617                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14618             }
14619
14620             _invlist_union(properties, cp_list, &cp_list);
14621             SvREFCNT_dec_NN(properties);
14622         }
14623         else {
14624             cp_list = properties;
14625         }
14626
14627         if (warn_super) {
14628             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14629         }
14630     }
14631
14632     /* Here, we have calculated what code points should be in the character
14633      * class.
14634      *
14635      * Now we can see about various optimizations.  Fold calculation (which we
14636      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14637      * would invert to include K, which under /i would match k, which it
14638      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14639      * folded until runtime */
14640
14641     /* If we didn't do folding, it's because some information isn't available
14642      * until runtime; set the run-time fold flag for these.  (We don't have to
14643      * worry about properties folding, as that is taken care of by the swash
14644      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14645      * locales, or the class matches at least one 0-255 range code point */
14646     if (LOC && FOLD) {
14647         if (only_utf8_locale_list) {
14648             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14649         }
14650         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14651                                the list */
14652             UV start, end;
14653             invlist_iterinit(cp_list);
14654             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14655                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14656             }
14657             invlist_iterfinish(cp_list);
14658         }
14659     }
14660
14661     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14662      * at compile time.  Besides not inverting folded locale now, we can't
14663      * invert if there are things such as \w, which aren't known until runtime
14664      * */
14665     if (invert
14666         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14667         && ! depends_list
14668         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14669     {
14670         _invlist_invert(cp_list);
14671
14672         /* Any swash can't be used as-is, because we've inverted things */
14673         if (swash) {
14674             SvREFCNT_dec_NN(swash);
14675             swash = NULL;
14676         }
14677
14678         /* Clear the invert flag since have just done it here */
14679         invert = FALSE;
14680     }
14681
14682     if (ret_invlist) {
14683         *ret_invlist = cp_list;
14684         SvREFCNT_dec(swash);
14685
14686         /* Discard the generated node */
14687         if (SIZE_ONLY) {
14688             RExC_size = orig_size;
14689         }
14690         else {
14691             RExC_emit = orig_emit;
14692         }
14693         return orig_emit;
14694     }
14695
14696     /* Some character classes are equivalent to other nodes.  Such nodes take
14697      * up less room and generally fewer operations to execute than ANYOF nodes.
14698      * Above, we checked for and optimized into some such equivalents for
14699      * certain common classes that are easy to test.  Getting to this point in
14700      * the code means that the class didn't get optimized there.  Since this
14701      * code is only executed in Pass 2, it is too late to save space--it has
14702      * been allocated in Pass 1, and currently isn't given back.  But turning
14703      * things into an EXACTish node can allow the optimizer to join it to any
14704      * adjacent such nodes.  And if the class is equivalent to things like /./,
14705      * expensive run-time swashes can be avoided.  Now that we have more
14706      * complete information, we can find things necessarily missed by the
14707      * earlier code.  I (khw) am not sure how much to look for here.  It would
14708      * be easy, but perhaps too slow, to check any candidates against all the
14709      * node types they could possibly match using _invlistEQ(). */
14710
14711     if (cp_list
14712         && ! invert
14713         && ! depends_list
14714         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14715         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14716
14717            /* We don't optimize if we are supposed to make sure all non-Unicode
14718             * code points raise a warning, as only ANYOF nodes have this check.
14719             * */
14720         && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14721     {
14722         UV start, end;
14723         U8 op = END;  /* The optimzation node-type */
14724         const char * cur_parse= RExC_parse;
14725
14726         invlist_iterinit(cp_list);
14727         if (! invlist_iternext(cp_list, &start, &end)) {
14728
14729             /* Here, the list is empty.  This happens, for example, when a
14730              * Unicode property is the only thing in the character class, and
14731              * it doesn't match anything.  (perluniprops.pod notes such
14732              * properties) */
14733             op = OPFAIL;
14734             *flagp |= HASWIDTH|SIMPLE;
14735         }
14736         else if (start == end) {    /* The range is a single code point */
14737             if (! invlist_iternext(cp_list, &start, &end)
14738
14739                     /* Don't do this optimization if it would require changing
14740                      * the pattern to UTF-8 */
14741                 && (start < 256 || UTF))
14742             {
14743                 /* Here, the list contains a single code point.  Can optimize
14744                  * into an EXACTish node */
14745
14746                 value = start;
14747
14748                 if (! FOLD) {
14749                     op = EXACT;
14750                 }
14751                 else if (LOC) {
14752
14753                     /* A locale node under folding with one code point can be
14754                      * an EXACTFL, as its fold won't be calculated until
14755                      * runtime */
14756                     op = EXACTFL;
14757                 }
14758                 else {
14759
14760                     /* Here, we are generally folding, but there is only one
14761                      * code point to match.  If we have to, we use an EXACT
14762                      * node, but it would be better for joining with adjacent
14763                      * nodes in the optimization pass if we used the same
14764                      * EXACTFish node that any such are likely to be.  We can
14765                      * do this iff the code point doesn't participate in any
14766                      * folds.  For example, an EXACTF of a colon is the same as
14767                      * an EXACT one, since nothing folds to or from a colon. */
14768                     if (value < 256) {
14769                         if (IS_IN_SOME_FOLD_L1(value)) {
14770                             op = EXACT;
14771                         }
14772                     }
14773                     else {
14774                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14775                             op = EXACT;
14776                         }
14777                     }
14778
14779                     /* If we haven't found the node type, above, it means we
14780                      * can use the prevailing one */
14781                     if (op == END) {
14782                         op = compute_EXACTish(pRExC_state);
14783                     }
14784                 }
14785             }
14786         }
14787         else if (start == 0) {
14788             if (end == UV_MAX) {
14789                 op = SANY;
14790                 *flagp |= HASWIDTH|SIMPLE;
14791                 RExC_naughty++;
14792             }
14793             else if (end == '\n' - 1
14794                     && invlist_iternext(cp_list, &start, &end)
14795                     && start == '\n' + 1 && end == UV_MAX)
14796             {
14797                 op = REG_ANY;
14798                 *flagp |= HASWIDTH|SIMPLE;
14799                 RExC_naughty++;
14800             }
14801         }
14802         invlist_iterfinish(cp_list);
14803
14804         if (op != END) {
14805             RExC_parse = (char *)orig_parse;
14806             RExC_emit = (regnode *)orig_emit;
14807
14808             ret = reg_node(pRExC_state, op);
14809
14810             RExC_parse = (char *)cur_parse;
14811
14812             if (PL_regkind[op] == EXACT) {
14813                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14814                                            TRUE /* downgradable to EXACT */
14815                                           );
14816             }
14817
14818             SvREFCNT_dec_NN(cp_list);
14819             return ret;
14820         }
14821     }
14822
14823     /* Here, <cp_list> contains all the code points we can determine at
14824      * compile time that match under all conditions.  Go through it, and
14825      * for things that belong in the bitmap, put them there, and delete from
14826      * <cp_list>.  While we are at it, see if everything above 255 is in the
14827      * list, and if so, set a flag to speed up execution */
14828
14829     populate_ANYOF_from_invlist(ret, &cp_list);
14830
14831     if (invert) {
14832         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14833     }
14834
14835     /* Here, the bitmap has been populated with all the Latin1 code points that
14836      * always match.  Can now add to the overall list those that match only
14837      * when the target string is UTF-8 (<depends_list>). */
14838     if (depends_list) {
14839         if (cp_list) {
14840             _invlist_union(cp_list, depends_list, &cp_list);
14841             SvREFCNT_dec_NN(depends_list);
14842         }
14843         else {
14844             cp_list = depends_list;
14845         }
14846         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14847     }
14848
14849     /* If there is a swash and more than one element, we can't use the swash in
14850      * the optimization below. */
14851     if (swash && element_count > 1) {
14852         SvREFCNT_dec_NN(swash);
14853         swash = NULL;
14854     }
14855
14856     set_ANYOF_arg(pRExC_state, ret, cp_list,
14857                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14858                    ? listsv : NULL,
14859                   only_utf8_locale_list,
14860                   swash, has_user_defined_property);
14861
14862     *flagp |= HASWIDTH|SIMPLE;
14863
14864     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14865         RExC_contains_locale = 1;
14866     }
14867
14868     return ret;
14869 }
14870
14871 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14872
14873 STATIC void
14874 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14875                 regnode* const node,
14876                 SV* const cp_list,
14877                 SV* const runtime_defns,
14878                 SV* const only_utf8_locale_list,
14879                 SV* const swash,
14880                 const bool has_user_defined_property)
14881 {
14882     /* Sets the arg field of an ANYOF-type node 'node', using information about
14883      * the node passed-in.  If there is nothing outside the node's bitmap, the
14884      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14885      * the count returned by add_data(), having allocated and stored an array,
14886      * av, that that count references, as follows:
14887      *  av[0] stores the character class description in its textual form.
14888      *        This is used later (regexec.c:Perl_regclass_swash()) to
14889      *        initialize the appropriate swash, and is also useful for dumping
14890      *        the regnode.  This is set to &PL_sv_undef if the textual
14891      *        description is not needed at run-time (as happens if the other
14892      *        elements completely define the class)
14893      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14894      *        computed from av[0].  But if no further computation need be done,
14895      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14896      *  av[2] stores the inversion list of code points that match only if the
14897      *        current locale is UTF-8
14898      *  av[3] stores the cp_list inversion list for use in addition or instead
14899      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14900      *        (Otherwise everything needed is already in av[0] and av[1])
14901      *  av[4] is set if any component of the class is from a user-defined
14902      *        property; used only if av[3] exists */
14903
14904     UV n;
14905
14906     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14907
14908     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14909         assert(! (ANYOF_FLAGS(node)
14910                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14911         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14912     }
14913     else {
14914         AV * const av = newAV();
14915         SV *rv;
14916
14917         assert(ANYOF_FLAGS(node)
14918                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14919
14920         av_store(av, 0, (runtime_defns)
14921                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14922         if (swash) {
14923             av_store(av, 1, swash);
14924             SvREFCNT_dec_NN(cp_list);
14925         }
14926         else {
14927             av_store(av, 1, &PL_sv_undef);
14928             if (cp_list) {
14929                 av_store(av, 3, cp_list);
14930                 av_store(av, 4, newSVuv(has_user_defined_property));
14931             }
14932         }
14933
14934         if (only_utf8_locale_list) {
14935             av_store(av, 2, only_utf8_locale_list);
14936         }
14937         else {
14938             av_store(av, 2, &PL_sv_undef);
14939         }
14940
14941         rv = newRV_noinc(MUTABLE_SV(av));
14942         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14943         RExC_rxi->data->data[n] = (void*)rv;
14944         ARG_SET(node, n);
14945     }
14946 }
14947
14948
14949 /* reg_skipcomment()
14950
14951    Absorbs an /x style # comments from the input stream.
14952    Returns true if there is more text remaining in the stream.
14953    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14954    terminates the pattern without including a newline.
14955
14956    Note its the callers responsibility to ensure that we are
14957    actually in /x mode
14958
14959 */
14960
14961 STATIC bool
14962 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14963 {
14964     bool ended = 0;
14965
14966     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14967
14968     while (RExC_parse < RExC_end)
14969         if (*RExC_parse++ == '\n') {
14970             ended = 1;
14971             break;
14972         }
14973     if (!ended) {
14974         /* we ran off the end of the pattern without ending
14975            the comment, so we have to add an \n when wrapping */
14976         RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14977         return 0;
14978     } else
14979         return 1;
14980 }
14981
14982 /* nextchar()
14983
14984    Advances the parse position, and optionally absorbs
14985    "whitespace" from the inputstream.
14986
14987    Without /x "whitespace" means (?#...) style comments only,
14988    with /x this means (?#...) and # comments and whitespace proper.
14989
14990    Returns the RExC_parse point from BEFORE the scan occurs.
14991
14992    This is the /x friendly way of saying RExC_parse++.
14993 */
14994
14995 STATIC char*
14996 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14997 {
14998     char* const retval = RExC_parse++;
14999
15000     PERL_ARGS_ASSERT_NEXTCHAR;
15001
15002     for (;;) {
15003         if (RExC_end - RExC_parse >= 3
15004             && *RExC_parse == '('
15005             && RExC_parse[1] == '?'
15006             && RExC_parse[2] == '#')
15007         {
15008             while (*RExC_parse != ')') {
15009                 if (RExC_parse == RExC_end)
15010                     FAIL("Sequence (?#... not terminated");
15011                 RExC_parse++;
15012             }
15013             RExC_parse++;
15014             continue;
15015         }
15016         if (RExC_flags & RXf_PMf_EXTENDED) {
15017             if (isSPACE(*RExC_parse)) {
15018                 RExC_parse++;
15019                 continue;
15020             }
15021             else if (*RExC_parse == '#') {
15022                 if ( reg_skipcomment( pRExC_state ) )
15023                     continue;
15024             }
15025         }
15026         return retval;
15027     }
15028 }
15029
15030 /*
15031 - reg_node - emit a node
15032 */
15033 STATIC regnode *                        /* Location. */
15034 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15035 {
15036     dVAR;
15037     regnode *ptr;
15038     regnode * const ret = RExC_emit;
15039     GET_RE_DEBUG_FLAGS_DECL;
15040
15041     PERL_ARGS_ASSERT_REG_NODE;
15042
15043     if (SIZE_ONLY) {
15044         SIZE_ALIGN(RExC_size);
15045         RExC_size += 1;
15046         return(ret);
15047     }
15048     if (RExC_emit >= RExC_emit_bound)
15049         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15050                    op, RExC_emit, RExC_emit_bound);
15051
15052     NODE_ALIGN_FILL(ret);
15053     ptr = ret;
15054     FILL_ADVANCE_NODE(ptr, op);
15055 #ifdef RE_TRACK_PATTERN_OFFSETS
15056     if (RExC_offsets) {         /* MJD */
15057         MJD_OFFSET_DEBUG(
15058               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15059               "reg_node", __LINE__,
15060               PL_reg_name[op],
15061               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15062                 ? "Overwriting end of array!\n" : "OK",
15063               (UV)(RExC_emit - RExC_emit_start),
15064               (UV)(RExC_parse - RExC_start),
15065               (UV)RExC_offsets[0]));
15066         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15067     }
15068 #endif
15069     RExC_emit = ptr;
15070     return(ret);
15071 }
15072
15073 /*
15074 - reganode - emit a node with an argument
15075 */
15076 STATIC regnode *                        /* Location. */
15077 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15078 {
15079     dVAR;
15080     regnode *ptr;
15081     regnode * const ret = RExC_emit;
15082     GET_RE_DEBUG_FLAGS_DECL;
15083
15084     PERL_ARGS_ASSERT_REGANODE;
15085
15086     if (SIZE_ONLY) {
15087         SIZE_ALIGN(RExC_size);
15088         RExC_size += 2;
15089         /*
15090            We can't do this:
15091
15092            assert(2==regarglen[op]+1);
15093
15094            Anything larger than this has to allocate the extra amount.
15095            If we changed this to be:
15096
15097            RExC_size += (1 + regarglen[op]);
15098
15099            then it wouldn't matter. Its not clear what side effect
15100            might come from that so its not done so far.
15101            -- dmq
15102         */
15103         return(ret);
15104     }
15105     if (RExC_emit >= RExC_emit_bound)
15106         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15107                    op, RExC_emit, RExC_emit_bound);
15108
15109     NODE_ALIGN_FILL(ret);
15110     ptr = ret;
15111     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15112 #ifdef RE_TRACK_PATTERN_OFFSETS
15113     if (RExC_offsets) {         /* MJD */
15114         MJD_OFFSET_DEBUG(
15115               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15116               "reganode",
15117               __LINE__,
15118               PL_reg_name[op],
15119               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15120               "Overwriting end of array!\n" : "OK",
15121               (UV)(RExC_emit - RExC_emit_start),
15122               (UV)(RExC_parse - RExC_start),
15123               (UV)RExC_offsets[0]));
15124         Set_Cur_Node_Offset;
15125     }
15126 #endif
15127     RExC_emit = ptr;
15128     return(ret);
15129 }
15130
15131 /*
15132 - reguni - emit (if appropriate) a Unicode character
15133 */
15134 PERL_STATIC_INLINE STRLEN
15135 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15136 {
15137     dVAR;
15138
15139     PERL_ARGS_ASSERT_REGUNI;
15140
15141     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15142 }
15143
15144 /*
15145 - reginsert - insert an operator in front of already-emitted operand
15146 *
15147 * Means relocating the operand.
15148 */
15149 STATIC void
15150 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15151 {
15152     dVAR;
15153     regnode *src;
15154     regnode *dst;
15155     regnode *place;
15156     const int offset = regarglen[(U8)op];
15157     const int size = NODE_STEP_REGNODE + offset;
15158     GET_RE_DEBUG_FLAGS_DECL;
15159
15160     PERL_ARGS_ASSERT_REGINSERT;
15161     PERL_UNUSED_ARG(depth);
15162 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15163     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15164     if (SIZE_ONLY) {
15165         RExC_size += size;
15166         return;
15167     }
15168
15169     src = RExC_emit;
15170     RExC_emit += size;
15171     dst = RExC_emit;
15172     if (RExC_open_parens) {
15173         int paren;
15174         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15175         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15176             if ( RExC_open_parens[paren] >= opnd ) {
15177                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15178                 RExC_open_parens[paren] += size;
15179             } else {
15180                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15181             }
15182             if ( RExC_close_parens[paren] >= opnd ) {
15183                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15184                 RExC_close_parens[paren] += size;
15185             } else {
15186                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15187             }
15188         }
15189     }
15190
15191     while (src > opnd) {
15192         StructCopy(--src, --dst, regnode);
15193 #ifdef RE_TRACK_PATTERN_OFFSETS
15194         if (RExC_offsets) {     /* MJD 20010112 */
15195             MJD_OFFSET_DEBUG(
15196                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15197                   "reg_insert",
15198                   __LINE__,
15199                   PL_reg_name[op],
15200                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15201                     ? "Overwriting end of array!\n" : "OK",
15202                   (UV)(src - RExC_emit_start),
15203                   (UV)(dst - RExC_emit_start),
15204                   (UV)RExC_offsets[0]));
15205             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15206             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15207         }
15208 #endif
15209     }
15210
15211
15212     place = opnd;               /* Op node, where operand used to be. */
15213 #ifdef RE_TRACK_PATTERN_OFFSETS
15214     if (RExC_offsets) {         /* MJD */
15215         MJD_OFFSET_DEBUG(
15216               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15217               "reginsert",
15218               __LINE__,
15219               PL_reg_name[op],
15220               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15221               ? "Overwriting end of array!\n" : "OK",
15222               (UV)(place - RExC_emit_start),
15223               (UV)(RExC_parse - RExC_start),
15224               (UV)RExC_offsets[0]));
15225         Set_Node_Offset(place, RExC_parse);
15226         Set_Node_Length(place, 1);
15227     }
15228 #endif
15229     src = NEXTOPER(place);
15230     FILL_ADVANCE_NODE(place, op);
15231     Zero(src, offset, regnode);
15232 }
15233
15234 /*
15235 - regtail - set the next-pointer at the end of a node chain of p to val.
15236 - SEE ALSO: regtail_study
15237 */
15238 /* TODO: All three parms should be const */
15239 STATIC void
15240 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15241                 const regnode *val,U32 depth)
15242 {
15243     dVAR;
15244     regnode *scan;
15245     GET_RE_DEBUG_FLAGS_DECL;
15246
15247     PERL_ARGS_ASSERT_REGTAIL;
15248 #ifndef DEBUGGING
15249     PERL_UNUSED_ARG(depth);
15250 #endif
15251
15252     if (SIZE_ONLY)
15253         return;
15254
15255     /* Find last node. */
15256     scan = p;
15257     for (;;) {
15258         regnode * const temp = regnext(scan);
15259         DEBUG_PARSE_r({
15260             SV * const mysv=sv_newmortal();
15261             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15262             regprop(RExC_rx, mysv, scan, NULL);
15263             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15264                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15265                     (temp == NULL ? "->" : ""),
15266                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15267             );
15268         });
15269         if (temp == NULL)
15270             break;
15271         scan = temp;
15272     }
15273
15274     if (reg_off_by_arg[OP(scan)]) {
15275         ARG_SET(scan, val - scan);
15276     }
15277     else {
15278         NEXT_OFF(scan) = val - scan;
15279     }
15280 }
15281
#ifdef DEBUGGING
/*
- regtail_study - set the next-pointer at the end of a node chain of p to val.
- Look for optimizable sequences at the same time.
- currently only looks for EXACT chains.

This is experimental code. The idea is to use this routine to perform
in place optimizations on branches and groups as they are constructed,
with the long term intention of removing optimization from study_chunk so
that it is purely analytical.

Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
to control which is which.

The return value is PSEUDO in the sizing pass, or if the chain held nothing
but NOTHING nodes; the opcode shared by every EXACTish node in the chain if
they are all of the same type (NOTHING nodes are ignored); and 0 otherwise.

*/
/* TODO: All four parms should be const */

STATIC U8
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
                      const regnode *val,U32 depth)
{
    dVAR;
    regnode *cur;
    regnode *next;
    U8 kind = PSEUDO;   /* PSEUDO: no EXACTish node seen yet */
#ifdef EXPERIMENTAL_INPLACESCAN
    I32 min = 0;
#endif
    GET_RE_DEBUG_FLAGS_DECL;

    PERL_ARGS_ASSERT_REGTAIL_STUDY;


    if (SIZE_ONLY)
        return kind;

    /* Find last node. */

    cur = p;
    while (1) {
        next = regnext(cur);
#ifdef EXPERIMENTAL_INPLACESCAN
        if (PL_regkind[OP(cur)] == EXACT) {
            bool unfolded_multi_char;   /* Unexamined in this routine */
            if (join_exact(pRExC_state, cur, &min,
                           &unfolded_multi_char, 1, val, depth+1))
                return EXACT;
        }
#endif
        /* Once 'kind' has dropped to 0 it stays there */
        if ( kind ) {
            switch (OP(cur)) {
                case NOTHING:
                    /* Has no effect on the result */
                    break;
                case EXACT:
                case EXACTF:
                case EXACTFA_NO_TRIE:
                case EXACTFA:
                case EXACTFU:
                case EXACTFU_SS:
                case EXACTFL:
                    if ( kind == PSEUDO ) {
                        kind = OP(cur);     /* First one sets the type */
                    }
                    else if ( kind != OP(cur) ) {
                        kind = 0;           /* Mixed types */
                    }
                    break;
                default:
                    kind = 0;
                    break;
            }
        }
        DEBUG_PARSE_r({
            SV * const mysv=sv_newmortal();
            DEBUG_PARSE_MSG((cur==p ? "tsdy" : ""));
            regprop(RExC_rx, mysv, cur, NULL);
            PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
                SvPV_nolen_const(mysv),
                REG_NODE_NUM(cur),
                PL_reg_name[kind]);
        });
        if (! next)
            break;
        cur = next;
    }
    DEBUG_PARSE_r({
        SV * const mysv_val=sv_newmortal();
        DEBUG_PARSE_MSG("");
        regprop(RExC_rx, mysv_val, val, NULL);
        PerlIO_printf(Perl_debug_log,
                      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
                      SvPV_nolen_const(mysv_val),
                      (IV)REG_NODE_NUM(val),
                      (IV)(val - cur)
        );
    });

    /* Attach 'val' to the last node, in whichever field holds its offset */
    if (! reg_off_by_arg[OP(cur)]) {
        NEXT_OFF(cur) = val - cur;
    }
    else {
        ARG_SET(cur, val - cur);
    }

    return kind;
}
#endif
15383
15384 /*
15385  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15386  */
15387 #ifdef DEBUGGING
15388
15389 static void
15390 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15391 {
          /* Print to Perl_debug_log the name (taken from
           * PL_reg_intflags_name[]) of every bit that is set in 'flags', each
           * followed by a space.
           *
           * lead   optional prefix.  When non-NULL it is printed once, before
           *        the first name, and the output is terminated with a
           *        newline; if no bit is set, "<lead>[none-set]\n" is printed
           *        instead.  When NULL, only the names are printed.
           * flags  the bit set to describe.
           */
15392     int bit;
15393     int set=0;
15394
15395     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15396
15397     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
              /* Build the mask in the (unsigned) type of 'flags': a plain
               * 1<<bit is a signed int shift, which is undefined once 'bit'
               * reaches the sign bit. */
15398         if (flags & ((U32)1 << bit)) {
15399             if (!set++ && lead)
15400                 PerlIO_printf(Perl_debug_log, "%s",lead);
15401             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15402         }
15403     }
15404     if (lead)  {
15405         if (set)
15406             PerlIO_printf(Perl_debug_log, "\n");
15407         else
15408             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15409     }
15410 }
15411
15412 static void
15413 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15414 {
          /* Print to Perl_debug_log the name (taken from
           * PL_reg_extflags_name[]) of every bit that is set in 'flags', each
           * followed by a space.  The bits covered by RXf_PMf_CHARSET are not
           * printed individually; instead the character set they encode is
           * printed afterwards as a single word, unless it is
           * REGEX_DEPENDS_CHARSET.
           *
           * lead   optional prefix.  When non-NULL it is printed once, before
           *        the first item, and the output is terminated with a
           *        newline; if nothing was printed, "<lead>[none-set]\n" is
           *        printed instead.  When NULL, only the items are printed.
           * flags  the bit set to describe.
           */
15415     int bit;
15416     int set=0;
15417     regex_charset cs;
15418
15419     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15420
15421     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
              /* Build the mask in the (unsigned) type of 'flags': a plain
               * 1<<bit is a signed int shift, which is undefined once 'bit'
               * reaches the sign bit. */
15422         if (flags & ((U32)1 << bit)) {
15423             if (((U32)1 << bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15424                 continue;
15425             }
15426             if (!set++ && lead)
15427                 PerlIO_printf(Perl_debug_log, "%s",lead);
15428             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15429         }
15430     }
          /* Now the character set, decoded from the bits skipped above */
15431     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15432             if (!set++ && lead) {
15433                 PerlIO_printf(Perl_debug_log, "%s",lead);
15434             }
15435             switch (cs) {
15436                 case REGEX_UNICODE_CHARSET:
15437                     PerlIO_printf(Perl_debug_log, "UNICODE");
15438                     break;
15439                 case REGEX_LOCALE_CHARSET:
15440                     PerlIO_printf(Perl_debug_log, "LOCALE");
15441                     break;
15442                 case REGEX_ASCII_RESTRICTED_CHARSET:
15443                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15444                     break;
15445                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15446                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15447                     break;
15448                 default:
15449                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15450                     break;
15451             }
15452     }
15453     if (lead)  {
15454         if (set)
15455             PerlIO_printf(Perl_debug_log, "\n");
15456         else
15457             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15458     }
15459 }
15460 #endif
15461
15462 void
15463 Perl_regdump(pTHX_ const regexp *r)
15464 {
          /* Write a description of the compiled regexp 'r' to Perl_debug_log:
           * first the program (via dumpuntil()), then a summary line built
           * from the header fields of 'r' (anchored/floating substrings, start
           * class, anchoring, minlen, ...), and finally, under DEBUG_FLAGS_r,
           * the names of the bits set in r->extflags and r->intflags.
           *
           * In a build without DEBUGGING this only runs the argument
           * assertion and prints nothing.
           */
15465 #ifdef DEBUGGING
15466     dVAR;
15467     SV * const sv = sv_newmortal();
15468     SV *dsv= sv_newmortal();
15469     RXi_GET_DECL(r,ri);
15470     GET_RE_DEBUG_FLAGS_DECL;
15471
15472     PERL_ARGS_ASSERT_REGDUMP;
15473
          /* The program itself; 'sv' is scratch space handed to dumpuntil() */
15474     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15475
15476     /* Header fields of interest. */
          /* For each of the anchored and floating substrings, the non-utf8
           * form is reported if present, otherwise the utf8 form. */
15477     if (r->anchored_substr) {
15478         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15479             RE_SV_DUMPLEN(r->anchored_substr), 30);
15480         PerlIO_printf(Perl_debug_log,
15481                       "anchored %s%s at %"IVdf" ",
15482                       s, RE_SV_TAIL(r->anchored_substr),
15483                       (IV)r->anchored_offset);
15484     } else if (r->anchored_utf8) {
15485         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15486             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15487         PerlIO_printf(Perl_debug_log,
15488                       "anchored utf8 %s%s at %"IVdf" ",
15489                       s, RE_SV_TAIL(r->anchored_utf8),
15490                       (IV)r->anchored_offset);
15491     }
15492     if (r->float_substr) {
15493         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15494             RE_SV_DUMPLEN(r->float_substr), 30);
15495         PerlIO_printf(Perl_debug_log,
15496                       "floating %s%s at %"IVdf"..%"UVuf" ",
15497                       s, RE_SV_TAIL(r->float_substr),
15498                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15499     } else if (r->float_utf8) {
15500         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15501             RE_SV_DUMPLEN(r->float_utf8), 30);
15502         PerlIO_printf(Perl_debug_log,
15503                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15504                       s, RE_SV_TAIL(r->float_utf8),
15505                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15506     }
          /* Say which of the two substrings is the "check" one.  The
           * parenthesis opened here is closed a few lines below, after the
           * optional " noscan" and " isall" markers (which are themselves
           * printed whether or not there is a check substring). */
15507     if (r->check_substr || r->check_utf8)
15508         PerlIO_printf(Perl_debug_log,
15509                       (const char *)
15510                       (r->check_substr == r->float_substr
15511                        && r->check_utf8 == r->float_utf8
15512                        ? "(checking floating" : "(checking anchored"));
15513     if (r->intflags & PREGf_NOSCAN)
15514         PerlIO_printf(Perl_debug_log, " noscan");
15515     if (r->extflags & RXf_CHECK_ALL)
15516         PerlIO_printf(Perl_debug_log, " isall");
15517     if (r->check_substr || r->check_utf8)
15518         PerlIO_printf(Perl_debug_log, ") ");
15519
          /* Start class, rendered through regprop() into the scratch 'sv' */
15520     if (ri->regstclass) {
15521         regprop(r, sv, ri->regstclass, NULL);
15522         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15523     }
          /* Anchoring: "anchored" followed by one tag per PREGf_ANCH_* bit */
15524     if (r->intflags & PREGf_ANCH) {
15525         PerlIO_printf(Perl_debug_log, "anchored");
15526         if (r->intflags & PREGf_ANCH_BOL)
15527             PerlIO_printf(Perl_debug_log, "(BOL)");
15528         if (r->intflags & PREGf_ANCH_MBOL)
15529             PerlIO_printf(Perl_debug_log, "(MBOL)");
15530         if (r->intflags & PREGf_ANCH_SBOL)
15531             PerlIO_printf(Perl_debug_log, "(SBOL)");
15532         if (r->intflags & PREGf_ANCH_GPOS)
15533             PerlIO_printf(Perl_debug_log, "(GPOS)");
15534         PerlIO_putc(Perl_debug_log, ' ');
15535     }
15536     if (r->intflags & PREGf_GPOS_SEEN)
15537         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15538     if (r->intflags & PREGf_SKIP)
15539         PerlIO_printf(Perl_debug_log, "plus ");
15540     if (r->intflags & PREGf_IMPLICIT)
15541         PerlIO_printf(Perl_debug_log, "implicit ");
          /* minlen is always reported; this also ends the summary line */
15542     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15543     if (r->extflags & RXf_EVAL_SEEN)
15544         PerlIO_printf(Perl_debug_log, "with eval ");
15545     PerlIO_printf(Perl_debug_log, "\n");
15546     DEBUG_FLAGS_r({
15547         regdump_extflags("r->extflags: ",r->extflags);
15548         regdump_intflags("r->intflags: ",r->intflags);
15549     });
15550 #else
15551     PERL_ARGS_ASSERT_REGDUMP;
15552     PERL_UNUSED_CONTEXT;
15553     PERL_UNUSED_ARG(r);
15554 #endif  /* DEBUGGING */
15555 }
15556
15557 /*
15558 - regprop - printable representation of opcode, with run time support
15559 */
15560
15561 void
15562 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15563 {
15564 #ifdef DEBUGGING
15565     dVAR;
15566     int k;
15567
15568     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15569     static const char * const anyofs[] = {
15570 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15571     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15572     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15573     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15574     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15575     || _CC_VERTSPACE != 16
15576   #error Need to adjust order of anyofs[]
15577 #endif
15578         "\\w",
15579         "\\W",
15580         "\\d",
15581         "\\D",
15582         "[:alpha:]",
15583         "[:^alpha:]",
15584         "[:lower:]",
15585         "[:^lower:]",
15586         "[:upper:]",
15587         "[:^upper:]",
15588         "[:punct:]",
15589         "[:^punct:]",
15590         "[:print:]",
15591         "[:^print:]",
15592         "[:alnum:]",
15593         "[:^alnum:]",
15594         "[:graph:]",
15595         "[:^graph:]",
15596         "[:cased:]",
15597         "[:^cased:]",
15598         "\\s",
15599         "\\S",
15600         "[:blank:]",
15601         "[:^blank:]",
15602         "[:xdigit:]",
15603         "[:^xdigit:]",
15604         "[:space:]",
15605         "[:^space:]",
15606         "[:cntrl:]",
15607         "[:^cntrl:]",
15608         "[:ascii:]",
15609         "[:^ascii:]",
15610         "\\v",
15611         "\\V"
15612     };
15613     RXi_GET_DECL(prog,progi);
15614     GET_RE_DEBUG_FLAGS_DECL;
15615
15616     PERL_ARGS_ASSERT_REGPROP;
15617
15618     sv_setpvs(sv, "");
15619
15620     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15621         /* It would be nice to FAIL() here, but this may be called from
15622            regexec.c, and it would be hard to supply pRExC_state. */
15623         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15624                                               (int)OP(o), (int)REGNODE_MAX);
15625     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15626
15627     k = PL_regkind[OP(o)];
15628
15629     if (k == EXACT) {
15630         sv_catpvs(sv, " ");
15631         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15632          * is a crude hack but it may be the best for now since
15633          * we have no flag "this EXACTish node was UTF-8"
15634          * --jhi */
15635         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15636                   PERL_PV_ESCAPE_UNI_DETECT |
15637                   PERL_PV_ESCAPE_NONASCII   |
15638                   PERL_PV_PRETTY_ELLIPSES   |
15639                   PERL_PV_PRETTY_LTGT       |
15640                   PERL_PV_PRETTY_NOCLEAR
15641                   );
15642     } else if (k == TRIE) {
15643         /* print the details of the trie in dumpuntil instead, as
15644          * progi->data isn't available here */
15645         const char op = OP(o);
15646         const U32 n = ARG(o);
15647         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15648                (reg_ac_data *)progi->data->data[n] :
15649                NULL;
15650         const reg_trie_data * const trie
15651             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15652
15653         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15654         DEBUG_TRIE_COMPILE_r(
15655           Perl_sv_catpvf(aTHX_ sv,
15656             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15657             (UV)trie->startstate,
15658             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15659             (UV)trie->wordcount,
15660             (UV)trie->minlen,
15661             (UV)trie->maxlen,
15662             (UV)TRIE_CHARCOUNT(trie),
15663             (UV)trie->uniquecharcount
15664           );
15665         );
15666         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15667             sv_catpvs(sv, "[");
15668             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15669                                                    ? ANYOF_BITMAP(o)
15670                                                    : TRIE_BITMAP(trie));
15671             sv_catpvs(sv, "]");
15672         }
15673
15674     } else if (k == CURLY) {
15675         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15676             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15677         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15678     }
15679     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15680         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15681     else if (k == REF || k == OPEN || k == CLOSE
15682              || k == GROUPP || OP(o)==ACCEPT)
15683     {
15684         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15685         if ( RXp_PAREN_NAMES(prog) ) {
15686             if ( k != REF || (OP(o) < NREF)) {
15687                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15688                 SV **name= av_fetch(list, ARG(o), 0 );
15689                 if (name)
15690                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15691             }
15692             else {
15693                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15694                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15695                 I32 *nums=(I32*)SvPVX(sv_dat);
15696                 SV **name= av_fetch(list, nums[0], 0 );
15697                 I32 n;
15698                 if (name) {
15699                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15700                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15701                                     (n ? "," : ""), (IV)nums[n]);
15702                     }
15703                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15704                 }
15705             }
15706         }
15707         if ( k == REF && reginfo) {
15708             U32 n = ARG(o);  /* which paren pair */
15709             I32 ln = prog->offs[n].start;
15710             if (prog->lastparen < n || ln == -1)
15711                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15712             else if (ln == prog->offs[n].end)
15713                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15714             else {
15715                 const char *s = reginfo->strbeg + ln;
15716                 Perl_sv_catpvf(aTHX_ sv, ": ");
15717                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15718                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15719             }
15720         }
15721     } else if (k == GOSUB)
15722         /* Paren and offset */
15723         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15724     else if (k == VERB) {
15725         if (!o->flags)
15726             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15727                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15728     } else if (k == LOGICAL)
15729         /* 2: embedded, otherwise 1 */
15730         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15731     else if (k == ANYOF) {
15732         const U8 flags = ANYOF_FLAGS(o);
15733         int do_sep = 0;
15734
15735
15736         if (flags & ANYOF_LOCALE_FLAGS)
15737             sv_catpvs(sv, "{loc}");
15738         if (flags & ANYOF_LOC_FOLD)
15739             sv_catpvs(sv, "{i}");
15740         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15741         if (flags & ANYOF_INVERT)
15742             sv_catpvs(sv, "^");
15743
15744         /* output what the standard cp 0-255 bitmap matches */
15745         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15746
15747         /* output any special charclass tests (used entirely under use
15748          * locale) * */
15749         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15750             int i;
15751             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15752                 if (ANYOF_POSIXL_TEST(o,i)) {
15753                     sv_catpv(sv, anyofs[i]);
15754                     do_sep = 1;
15755                 }
15756             }
15757         }
15758
15759         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15760                       |ANYOF_UTF8
15761                       |ANYOF_NONBITMAP_NON_UTF8
15762                       |ANYOF_LOC_FOLD)))
15763         {
15764             if (do_sep) {
15765                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15766                 if (flags & ANYOF_INVERT)
15767                     /*make sure the invert info is in each */
15768                     sv_catpvs(sv, "^");
15769             }
15770
15771             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15772                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15773             }
15774
15775             /* output information about the unicode matching */
15776             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15777                 sv_catpvs(sv, "{unicode_all}");
15778             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15779                 SV *lv; /* Set if there is something outside the bit map. */
15780                 bool byte_output = FALSE;   /* If something in the bitmap has
15781                                                been output */
15782                 SV *only_utf8_locale;
15783
15784                 /* Get the stuff that wasn't in the bitmap */
15785                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15786                                                     &lv, &only_utf8_locale);
15787                 if (lv && lv != &PL_sv_undef) {
15788                     char *s = savesvpv(lv);
15789                     char * const origs = s;
15790
15791                     while (*s && *s != '\n')
15792                         s++;
15793
15794                     if (*s == '\n') {
15795                         const char * const t = ++s;
15796
15797                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15798                             sv_catpvs(sv, "{outside bitmap}");
15799                         }
15800                         else {
15801                             sv_catpvs(sv, "{utf8}");
15802                         }
15803
15804                         if (byte_output) {
15805                             sv_catpvs(sv, " ");
15806                         }
15807
15808                         while (*s) {
15809                             if (*s == '\n') {
15810
15811                                 /* Truncate very long output */
15812                                 if (s - origs > 256) {
15813                                     Perl_sv_catpvf(aTHX_ sv,
15814                                                 "%.*s...",
15815                                                 (int) (s - origs - 1),
15816                                                 t);
15817                                     goto out_dump;
15818                                 }
15819                                 *s = ' ';
15820                             }
15821                             else if (*s == '\t') {
15822                                 *s = '-';
15823                             }
15824                             s++;
15825                         }
15826                         if (s[-1] == ' ')
15827                             s[-1] = 0;
15828
15829                         sv_catpv(sv, t);
15830                     }
15831
15832                 out_dump:
15833
15834                     Safefree(origs);
15835                     SvREFCNT_dec_NN(lv);
15836                 }
15837
15838                 if ((flags & ANYOF_LOC_FOLD)
15839                      && only_utf8_locale
15840                      && only_utf8_locale != &PL_sv_undef)
15841                 {
15842                     UV start, end;
15843                     int max_entries = 256;
15844
15845                     sv_catpvs(sv, "{utf8 locale}");
15846                     invlist_iterinit(only_utf8_locale);
15847                     while (invlist_iternext(only_utf8_locale,
15848                                             &start, &end)) {
15849                         put_range(sv, start, end);
15850                         max_entries --;
15851                         if (max_entries < 0) {
15852                             sv_catpvs(sv, "...");
15853                             break;
15854                         }
15855                     }
15856                     invlist_iterfinish(only_utf8_locale);
15857                 }
15858             }
15859         }
15860
15861         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15862     }
15863     else if (k == POSIXD || k == NPOSIXD) {
15864         U8 index = FLAGS(o) * 2;
15865         if (index < C_ARRAY_LENGTH(anyofs)) {
15866             if (*anyofs[index] != '[')  {
15867                 sv_catpv(sv, "[");
15868             }
15869             sv_catpv(sv, anyofs[index]);
15870             if (*anyofs[index] != '[')  {
15871                 sv_catpv(sv, "]");
15872             }
15873         }
15874         else {
15875             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15876         }
15877     }
15878     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15879         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15880 #else
15881     PERL_UNUSED_CONTEXT;
15882     PERL_UNUSED_ARG(sv);
15883     PERL_UNUSED_ARG(o);
15884     PERL_UNUSED_ARG(prog);
15885     PERL_UNUSED_ARG(reginfo);
15886 #endif  /* DEBUGGING */
15887 }
15888
15889
15890
15891 SV *
15892 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15893 {                               /* Assume that RE_INTUIT is set */
          /* Return the "check" substring SV of regexp 'r': prog->check_substr
           * if that is non-NULL, otherwise prog->check_utf8 (which may itself
           * be NULL).  The returned pointer is the one stored in the regexp;
           * no new SV is created and no reference count is changed here.
           *
           * Under DEBUG_COMPILE_r the substring (at most its first 60 bytes,
           * followed by "..." if it is longer) is also logged to
           * Perl_debug_log.
           */
15894     dVAR;
15895     struct regexp *const prog = ReANY(r);
15896     GET_RE_DEBUG_FLAGS_DECL;
15897
15898     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15899     PERL_UNUSED_CONTEXT;
15900
15901     DEBUG_COMPILE_r(
15902         {
15903             const char * const s = SvPV_nolen_const(prog->check_substr
15904                       ? prog->check_substr : prog->check_utf8);
15905
15906             if (!PL_colorset) reginitcolors();
15907             PerlIO_printf(Perl_debug_log,
15908                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15909                       PL_colors[4],
15910                       prog->check_substr ? "" : "utf8 ",
15911                       PL_colors[5],PL_colors[0],
15912                       s,
15913                       PL_colors[1],
15914                       (strlen(s) > 60 ? "..." : ""));
15915         } );
15916
15917     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15918 }
15919
15920 /*
15921    pregfree()
15922
15923    handles refcounting and freeing the perl core regexp structure. When
15924    it is necessary to actually free the structure the first thing it
15925    does is call the 'free' method of the regexp_engine associated to
15926    the regexp, allowing the handling of the void *pprivate; member
15927    first. (This routine is not overridable by extensions, which is why
15928    the extension's free is called first.)
15929
15930    See regdupe and regdupe_internal if you change anything here.
15931 */
15932 #ifndef PERL_IN_XSUB_RE
15933 void
15934 Perl_pregfree(pTHX_ REGEXP *r)
15935 {
          /* Release one reference to the REGEXP 'r'.  This only decrements the
           * SV reference count; it does not tear the structure down itself. */
15936     SvREFCNT_dec(r);
15937 }
15938
15939 void
15940 Perl_pregfree2(pTHX_ REGEXP *rx)
15941 {
          /* Free what the regexp body of 'rx' holds, then clear
           * rx->sv_u.svu_rx.  The SV head of 'rx' itself is not freed here.
           */
15942     dVAR;
15943     struct regexp *const r = ReANY(rx);
15944     GET_RE_DEBUG_FLAGS_DECL;
15945
15946     PERL_ARGS_ASSERT_PREGFREE2;
15947
          /* A regexp with a mother_re only drops its reference to that
           * regexp.  Otherwise this regexp frees the engine-private data (via
           * the engine's free hook), the paren-names reference and the
           * xpvlenu_pv buffer itself. */
15948     if (r->mother_re) {
15949         ReREFCNT_dec(r->mother_re);
15950     } else {
15951         CALLREGFREE_PVT(rx); /* free the private data */
15952         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15953         Safefree(r->xpv_len_u.xpvlenu_pv);
15954     }
          /* The remaining fields are released in both cases.  check_substr
           * and check_utf8 are not decremented here; only the four
           * anchored/float SVs are. */
15955     if (r->substrs) {
15956         SvREFCNT_dec(r->anchored_substr);
15957         SvREFCNT_dec(r->anchored_utf8);
15958         SvREFCNT_dec(r->float_substr);
15959         SvREFCNT_dec(r->float_utf8);
15960         Safefree(r->substrs);
15961     }
15962     RX_MATCH_COPY_FREE(rx);
15963 #ifdef PERL_ANY_COW
15964     SvREFCNT_dec(r->saved_copy);
15965 #endif
15966     Safefree(r->offs);
15967     SvREFCNT_dec(r->qr_anoncv);
15968     rx->sv_u.svu_rx = 0;
15969 }
15970
15971 /*  reg_temp_copy()
15972
15973     This is a hacky workaround to the structural issue of match results
15974     being stored in the regexp structure which is in turn stored in
15975     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15976     could be PL_curpm in multiple contexts, and could require multiple
15977     result sets being associated with the pattern simultaneously, such
15978     as when doing a recursive match with (??{$qr})
15979
15980     The solution is to make a lightweight copy of the regexp structure
15981     when a qr// is returned from the code executed by (??{$qr}) this
15982     lightweight copy doesn't actually own any of its data except for
15983     the start/end offsets and the actual regexp structure itself.
15984
15985 */
15986
15987
15988 REGEXP *
15989 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15990 {
          /* Make 'ret_x' a lightweight copy of the regexp 'rx' and return it.
           *
           * ret_x  destination.  If NULL, a new SVt_REGEXP is allocated.  If
           *        it is an SVt_PVLV, a fresh regexp body is attached to its
           *        sv_u slot first.
           * rx     the regexp to copy from.
           *
           * The regexp struct is copied bytewise from xpv_cur onwards, then
           * the copy is given its own offs array and substrs struct (taking
           * new references on the four anchored/float SVs) and a new reference
           * on qr_anoncv.  mother_re of the copy is set to rx's mother_re if
           * it has one, otherwise to rx itself, with a reference taken on it.
           */
15991     struct regexp *ret;
15992     struct regexp *const r = ReANY(rx);
15993     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15994
15995     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15996
15997     if (!ret_x)
15998         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15999     else {
16000         SvOK_off((SV *)ret_x);
16001         if (islv) {
16002             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16003                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16004                made both spots point to the same regexp body.) */
                  /* Allocate a throwaway REGEXP just to obtain a regexp body,
                   * move that body over to ret_x, then turn the donor into an
                   * SVt_NULL so that freeing it leaves the body alone. */
16005             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16006             assert(!SvPVX(ret_x));
16007             ret_x->sv_u.svu_rx = temp->sv_any;
16008             temp->sv_any = NULL;
16009             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16010             SvREFCNT_dec_NN(temp);
16011             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16012                ing below will not set it. */
16013             SvCUR_set(ret_x, SvCUR(rx));
16014         }
16015     }
16016     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16017        sv_force_normal(sv) is called.  */
16018     SvFAKE_on(ret_x);
16019     ret = ReANY(ret_x);
16020
16021     SvFLAGS(ret_x) |= SvUTF8(rx);
16022     /* We share the same string buffer as the original regexp, on which we
16023        hold a reference count, incremented when mother_re is set below.
16024        The string pointer is copied here, being part of the regexp struct.
16025      */
16026     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16027            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
          /* The memcpy above left ret->offs and ret->substrs pointing at r's
           * buffers; replace them with private copies. */
16028     if (r->offs) {
16029         const I32 npar = r->nparens+1;
16030         Newx(ret->offs, npar, regexp_paren_pair);
16031         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16032     }
16033     if (r->substrs) {
16034         Newx(ret->substrs, 1, struct reg_substr_data);
16035         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16036
16037         SvREFCNT_inc_void(ret->anchored_substr);
16038         SvREFCNT_inc_void(ret->anchored_utf8);
16039         SvREFCNT_inc_void(ret->float_substr);
16040         SvREFCNT_inc_void(ret->float_utf8);
16041
16042         /* check_substr and check_utf8, if non-NULL, point to either their
16043            anchored or float namesakes, and don't hold a second reference.  */
16044     }
          /* The copy starts without any saved match string of its own */
16045     RX_MATCH_COPIED_off(ret_x);
16046 #ifdef PERL_ANY_COW
16047     ret->saved_copy = NULL;
16048 #endif
16049     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16050     SvREFCNT_inc_void(ret->qr_anoncv);
16051
16052     return ret_x;
16053 }
16054 #endif
16055
16056 /* regfree_internal()
16057
16058    Free the private data in a regexp. This is overloadable by
16059    extensions. Perl takes care of the regexp structure in pregfree(),
16060    this covers the *pprivate pointer which technically perl doesn't
16061    know about, however of course we have to handle the
16062    regexp_internal structure when no extension is in use.
16063
16064    Note this is called before freeing anything in the regexp
16065    structure.
16066  */
16067
16068 void
16069 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16070 {
          /* Free the regexp_internal structure attached to 'rx' together with
           * everything it owns: the offsets array (when pattern offsets are
           * tracked), the code-block table, and every entry of the data
           * array, each released according to its one-letter 'what' type
           * code.  Croaks on a type code it does not know.
           *
           * Under DEBUG_COMPILE_r the pattern being freed is logged first.
           */
16071     dVAR;
16072     struct regexp *const r = ReANY(rx);
16073     RXi_GET_DECL(r,ri);
16074     GET_RE_DEBUG_FLAGS_DECL;
16075
16076     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16077
16078     DEBUG_COMPILE_r({
16079         if (!PL_colorset)
16080             reginitcolors();
16081         {
16082             SV *dsv= sv_newmortal();
16083             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16084                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16085             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16086                 PL_colors[4],PL_colors[5],s);
16087         }
16088     });
16089 #ifdef RE_TRACK_PATTERN_OFFSETS
16090     if (ri->u.offsets)
16091         Safefree(ri->u.offsets);             /* 20010421 MJD */
16092 #endif
          /* Each code block holds a reference on its source regex */
16093     if (ri->code_blocks) {
16094         int n;
16095         for (n = 0; n < ri->num_code_blocks; n++)
16096             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16097         Safefree(ri->code_blocks);
16098     }
16099
16100     if (ri->data) {
16101         int n = ri->data->count;
16102
              /* Walk the data array from the last entry down to the first */
16103         while (--n >= 0) {
16104           /* If you add a ->what type here, update the comment in regcomp.h */
16105             switch (ri->data->what[n]) {
                  /* These entries are SVs: drop one reference */
16106             case 'a':
16107             case 'r':
16108             case 's':
16109             case 'S':
16110             case 'u':
16111                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16112                 break;
                  /* A plain allocated buffer */
16113             case 'f':
16114                 Safefree(ri->data->data[n]);
16115                 break;
                  /* Nothing is freed for these two types here */
16116             case 'l':
16117             case 'L':
16118                 break;
16119             case 'T':
16120                 { /* Aho Corasick add-on structure for a trie node.
16121                      Used in stclass optimization only */
16122                     U32 refcount;
16123                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
                          /* The structure is reference counted; the count is
                           * decremented under OP_REFCNT_LOCK and the memory is
                           * only released once it reaches zero */
16124                     OP_REFCNT_LOCK;
16125                     refcount = --aho->refcount;
16126                     OP_REFCNT_UNLOCK;
16127                     if ( !refcount ) {
16128                         PerlMemShared_free(aho->states);
16129                         PerlMemShared_free(aho->fail);
16130                          /* do this last!!!! */
16131                         PerlMemShared_free(ri->data->data[n]);
16132                         PerlMemShared_free(ri->regstclass);
16133                     }
16134                 }
16135                 break;
16136             case 't':
16137                 {
16138                     /* trie structure. */
16139                     U32 refcount;
16140                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
                          /* Reference counted in the same way as 'T' above */
16141                     OP_REFCNT_LOCK;
16142                     refcount = --trie->refcount;
16143                     OP_REFCNT_UNLOCK;
16144                     if ( !refcount ) {
16145                         PerlMemShared_free(trie->charmap);
16146                         PerlMemShared_free(trie->states);
16147                         PerlMemShared_free(trie->trans);
16148                         if (trie->bitmap)
16149                             PerlMemShared_free(trie->bitmap);
16150                         if (trie->jump)
16151                             PerlMemShared_free(trie->jump);
16152                         PerlMemShared_free(trie->wordinfo);
16153                         /* do this last!!!! */
16154                         PerlMemShared_free(ri->data->data[n]);
16155                     }
16156                 }
16157                 break;
16158             default:
16159                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16160                                                     ri->data->what[n]);
16161             }
16162         }
              /* Finally the type-code array and the data container itself */
16163         Safefree(ri->data->what);
16164         Safefree(ri->data);
16165     }
16166
16167     Safefree(ri);
16168 }
16169
16170 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
      /* av_dup_inc (above) and hv_dup_inc (below) call sv_dup_inc() on 's'
       * with clone parameters 't' and cast the result back to AV* or HV*
       * respectively. */
16171 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
      /* SAVEPVN: savepvn() that passes a NULL 'p' through as NULL instead of
       * handing it to savepvn().  Note that 'p' is evaluated twice. */
16172 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16173
16174 /*
16175    re_dup - duplicate a regexp.
16176
16177    This routine is expected to clone a given regexp structure. It is only
16178    compiled under USE_ITHREADS.
16179
16180    After all of the core data stored in struct regexp is duplicated
16181    the regexp_engine.dupe method is used to copy any private data
16182    stored in the *pprivate pointer. This allows extensions to handle
16183    any duplication it needs to do.
16184
16185    See pregfree() and regfree_internal() if you change anything here.
16186 */
16187 #if defined(USE_ITHREADS)
16188 #ifndef PERL_IN_XSUB_RE
16189 void
16190 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16191 {
16192     dVAR;
16193     I32 npar;
16194     const struct regexp *r = ReANY(sstr);
16195     struct regexp *ret = ReANY(dstr);
16196
16197     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16198
16199     npar = r->nparens+1;
16200     Newx(ret->offs, npar, regexp_paren_pair);
16201     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16202
16203     if (ret->substrs) {
16204         /* Do it this way to avoid reading from *r after the StructCopy().
16205            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16206            cache, it doesn't matter.  */
16207         const bool anchored = r->check_substr
16208             ? r->check_substr == r->anchored_substr
16209             : r->check_utf8 == r->anchored_utf8;
16210         Newx(ret->substrs, 1, struct reg_substr_data);
16211         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16212
16213         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16214         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16215         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16216         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16217
16218         /* check_substr and check_utf8, if non-NULL, point to either their
16219            anchored or float namesakes, and don't hold a second reference.  */
16220
16221         if (ret->check_substr) {
16222             if (anchored) {
16223                 assert(r->check_utf8 == r->anchored_utf8);
16224                 ret->check_substr = ret->anchored_substr;
16225                 ret->check_utf8 = ret->anchored_utf8;
16226             } else {
16227                 assert(r->check_substr == r->float_substr);
16228                 assert(r->check_utf8 == r->float_utf8);
16229                 ret->check_substr = ret->float_substr;
16230                 ret->check_utf8 = ret->float_utf8;
16231             }
16232         } else if (ret->check_utf8) {
16233             if (anchored) {
16234                 ret->check_utf8 = ret->anchored_utf8;
16235             } else {
16236                 ret->check_utf8 = ret->float_utf8;
16237             }
16238         }
16239     }
16240
16241     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16242     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16243
16244     if (ret->pprivate)
16245         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16246
16247     if (RX_MATCH_COPIED(dstr))
16248         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16249     else
16250         ret->subbeg = NULL;
16251 #ifdef PERL_ANY_COW
16252     ret->saved_copy = NULL;
16253 #endif
16254
16255     /* Whether mother_re be set or no, we need to copy the string.  We
16256        cannot refrain from copying it when the storage points directly to
16257        our mother regexp, because that's
16258                1: a buffer in a different thread
16259                2: something we no longer hold a reference on
16260                so we need to copy it locally.  */
16261     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16262     ret->mother_re   = NULL;
16263 }
16264 #endif /* PERL_IN_XSUB_RE */
16265
16266 /*
16267    regdupe_internal()
16268
16269    This is the internal complement to regdupe() which is used to copy
16270    the structure pointed to by the *pprivate pointer in the regexp.
16271    This is the core version of the extension overridable cloning hook.
16272    The regexp structure being duplicated will be copied by perl prior
16273    to this and will be provided as the regexp *r argument, however
16274    with the /old/ structure's pprivate pointer value. Thus this routine
16275    may override any copying normally done by perl.
16276
16277    It returns a pointer to the new regexp_internal structure.
16278 */
16279
16280 void *
16281 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16282 {
16283     dVAR;
16284     struct regexp *const r = ReANY(rx);
16285     regexp_internal *reti;
16286     int len;
16287     RXi_GET_DECL(r,ri);
16288
16289     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16290
16291     len = ProgLen(ri);
16292
16293     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16294           char, regexp_internal);
16295     Copy(ri->program, reti->program, len+1, regnode);
16296
16297     reti->num_code_blocks = ri->num_code_blocks;
16298     if (ri->code_blocks) {
16299         int n;
16300         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16301                 struct reg_code_block);
16302         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16303                 struct reg_code_block);
16304         for (n = 0; n < ri->num_code_blocks; n++)
16305              reti->code_blocks[n].src_regex = (REGEXP*)
16306                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16307     }
16308     else
16309         reti->code_blocks = NULL;
16310
16311     reti->regstclass = NULL;
16312
16313     if (ri->data) {
16314         struct reg_data *d;
16315         const int count = ri->data->count;
16316         int i;
16317
16318         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16319                 char, struct reg_data);
16320         Newx(d->what, count, U8);
16321
16322         d->count = count;
16323         for (i = 0; i < count; i++) {
16324             d->what[i] = ri->data->what[i];
16325             switch (d->what[i]) {
16326                 /* see also regcomp.h and regfree_internal() */
16327             case 'a': /* actually an AV, but the dup function is identical.  */
16328             case 'r':
16329             case 's':
16330             case 'S':
16331             case 'u': /* actually an HV, but the dup function is identical.  */
16332                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16333                 break;
16334             case 'f':
16335                 /* This is cheating. */
16336                 Newx(d->data[i], 1, regnode_ssc);
16337                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16338                 reti->regstclass = (regnode*)d->data[i];
16339                 break;
16340             case 'T':
16341                 /* Trie stclasses are readonly and can thus be shared
16342                  * without duplication. We free the stclass in pregfree
16343                  * when the corresponding reg_ac_data struct is freed.
16344                  */
16345                 reti->regstclass= ri->regstclass;
16346                 /* Fall through */
16347             case 't':
16348                 OP_REFCNT_LOCK;
16349                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16350                 OP_REFCNT_UNLOCK;
16351                 /* Fall through */
16352             case 'l':
16353             case 'L':
16354                 d->data[i] = ri->data->data[i];
16355                 break;
16356             default:
16357                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16358                                                            ri->data->what[i]);
16359             }
16360         }
16361
16362         reti->data = d;
16363     }
16364     else
16365         reti->data = NULL;
16366
16367     reti->name_list_idx = ri->name_list_idx;
16368
16369 #ifdef RE_TRACK_PATTERN_OFFSETS
16370     if (ri->u.offsets) {
16371         Newx(reti->u.offsets, 2*len+1, U32);
16372         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16373     }
16374 #else
16375     SetProgLen(reti,len);
16376 #endif
16377
16378     return (void*)reti;
16379 }
16380
16381 #endif    /* USE_ITHREADS */
16382
16383 #ifndef PERL_IN_XSUB_RE
16384
16385 /*
16386  - regnext - dig the "next" pointer out of a node
16387  */
16388 regnode *
16389 Perl_regnext(pTHX_ regnode *p)
16390 {
16391     dVAR;
16392     I32 offset;
16393
16394     if (!p)
16395         return(NULL);
16396
16397     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16398         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16399                                                 (int)OP(p), (int)REGNODE_MAX);
16400     }
16401
16402     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16403     if (offset == 0)
16404         return(NULL);
16405
16406     return(p+offset);
16407 }
16408 #endif
16409
16410 STATIC void
16411 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16412 {
16413     va_list args;
16414     STRLEN l1 = strlen(pat1);
16415     STRLEN l2 = strlen(pat2);
16416     char buf[512];
16417     SV *msv;
16418     const char *message;
16419
16420     PERL_ARGS_ASSERT_RE_CROAK2;
16421
16422     if (l1 > 510)
16423         l1 = 510;
16424     if (l1 + l2 > 510)
16425         l2 = 510 - l1;
16426     Copy(pat1, buf, l1 , char);
16427     Copy(pat2, buf + l1, l2 , char);
16428     buf[l1 + l2] = '\n';
16429     buf[l1 + l2 + 1] = '\0';
16430     va_start(args, pat2);
16431     msv = vmess(buf, &args);
16432     va_end(args);
16433     message = SvPV_const(msv,l1);
16434     if (l1 > 512)
16435         l1 = 512;
16436     Copy(message, buf, l1 , char);
16437     /* l1-1 to avoid \n */
16438     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16439 }
16440
16441 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16442
16443 #ifndef PERL_IN_XSUB_RE
16444 void
16445 Perl_save_re_context(pTHX)
16446 {
16447     dVAR;
16448
16449     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16450     if (PL_curpm) {
16451         const REGEXP * const rx = PM_GETRE(PL_curpm);
16452         if (rx) {
16453             U32 i;
16454             for (i = 1; i <= RX_NPARENS(rx); i++) {
16455                 char digits[TYPE_CHARS(long)];
16456                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16457                                                "%lu", (long)i);
16458                 GV *const *const gvp
16459                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16460
16461                 if (gvp) {
16462                     GV * const gv = *gvp;
16463                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16464                         save_scalar(gv);
16465                 }
16466             }
16467         }
16468     }
16469 }
16470 #endif
16471
16472 #ifdef DEBUGGING
16473
16474 STATIC void
16475 S_put_byte(pTHX_ SV *sv, int c)
16476 {
16477     PERL_ARGS_ASSERT_PUT_BYTE;
16478
16479     if (!isPRINT(c)) {
16480         switch (c) {
16481             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16482             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16483             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16484             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16485             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16486
16487             default:
16488                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16489                 break;
16490         }
16491     }
16492     else {
16493         const char string = c;
16494         if (c == '-' || c == ']' || c == '\\' || c == '^')
16495             sv_catpvs(sv, "\\");
16496         sv_catpvn(sv, &string, 1);
16497     }
16498 }
16499
16500 STATIC void
16501 S_put_range(pTHX_ SV *sv, UV start, UV end)
16502 {
16503
16504     /* Appends to 'sv' a displayable version of the range of code points from
16505      * 'start' to 'end' */
16506
16507     assert(start <= end);
16508
16509     PERL_ARGS_ASSERT_PUT_RANGE;
16510
16511     if (end - start < 3) {  /* Individual chars in short ranges */
16512         for (; start <= end; start++)
16513             put_byte(sv, start);
16514     }
16515     else if (   end > 255
16516              || ! isALPHANUMERIC(start)
16517              || ! isALPHANUMERIC(end)
16518              || isDIGIT(start) != isDIGIT(end)
16519              || isUPPER(start) != isUPPER(end)
16520              || isLOWER(start) != isLOWER(end)
16521
16522                 /* This final test should get optimized out except on EBCDIC
16523                  * platforms, where it causes ranges that cross discontinuities
16524                  * like i/j to be shown as hex instead of the misleading,
16525                  * e.g. H-K (since that range includes more than H, I, J, K).
16526                  * */
16527              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16528     {
16529         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16530                        start,
16531                        (end < 256) ? end : 255);
16532     }
16533     else { /* Here, the ends of the range are both digits, or both uppercase,
16534               or both lowercase; and there's no discontinuity in the range
16535               (which could happen on EBCDIC platforms) */
16536         put_byte(sv, start);
16537         sv_catpvs(sv, "-");
16538         put_byte(sv, end);
16539     }
16540 }
16541
16542 STATIC bool
16543 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16544 {
16545     /* Appends to 'sv' a displayable version of the innards of the bracketed
16546      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16547      * output anything */
16548
16549     int i;
16550     bool has_output_anything = FALSE;
16551
16552     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16553
16554     for (i = 0; i < 256; i++) {
16555         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16556
16557             /* The character at index i should be output.  Find the next
16558              * character that should NOT be output */
16559             int j;
16560             for (j = i + 1; j <= 256; j++) {
16561                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16562                     break;
16563                 }
16564             }
16565
16566             /* Everything between them is a single range that should be output
16567              * */
16568             put_range(sv, i, j - 1);
16569             has_output_anything = TRUE;
16570             i = j;
16571         }
16572     }
16573
16574     return has_output_anything;
16575 }
16576
/* Helper macros for S_dumpuntil().  Both rely on that function's local
 * variables (optstart, node, r, start, last, sv, indent, depth).
 *
 * CLEAR_OPTSTART: if a run of optimized-away nodes is being tracked in
 * 'optstart', print how many nodes it covered (under the optimise debug flag)
 * and stop tracking it.  The 'if' is kept inside STMT_START/STMT_END so that
 * the macro is a single statement and cannot capture a following 'else'. */
#define CLEAR_OPTSTART                                                       \
    STMT_START {                                                             \
        if (optstart) {                                                      \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                   \
                              " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL;                                                   \
        }                                                                    \
    } STMT_END

/* DUMPUNTIL(b,e): close any pending optimized run, then recursively dump the
 * nodes from 'b' up to 'e' one indentation level deeper, leaving 'node' at
 * the position the nested call returns.  Wrapped in STMT_START/STMT_END so
 * that both steps stay together wherever the macro is used as a statement. */
#define DUMPUNTIL(b,e)                                                       \
    STMT_START {                                                             \
        CLEAR_OPTSTART;                                                      \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);            \
    } STMT_END
16587
/* S_dumpuntil - print a run of regnodes to Perl_debug_log, one node per line,
 * recursing (through the DUMPUNTIL macro) into the operands of branch, trie
 * and quantifier nodes with one more level of indentation.
 *
 *   r      - the compiled regexp; its private data array is consulted when a
 *            trie node is dumped
 *   start  - base from which every printed node offset is measured
 *   node   - first node to print
 *   last   - if non-NULL, dumping stops once 'node' reaches it
 *   plast  - optional second bound; it replaces 'last' when it is lower
 *   sv     - scratch SV that regprop() and pv_pretty() format into
 *   indent - current indentation level (two columns are printed per level)
 *   depth  - in the code shown here, only passed on (plus one) to nested
 *            calls
 *
 * Returns the node at which dumping stopped.
 */
16588 STATIC const regnode *
16589 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16590             const regnode *last, const regnode *plast,
16591             SV* sv, I32 indent, U32 depth)
16592 {
16593     dVAR;
16594     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16595     const regnode *next;
          /* First node of a run of OPTIMIZED nodes currently being skipped,
           * or NULL when no such run is open; see CLEAR_OPTSTART. */
16596     const regnode *optstart= NULL;
16597
16598     RXi_GET_DECL(r,ri);
16599     GET_RE_DEBUG_FLAGS_DECL;
16600
16601     PERL_ARGS_ASSERT_DUMPUNTIL;
16602
16603 #ifdef DEBUG_DUMPUNTIL
16604     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16605         last ? last-start : 0,plast ? plast-start : 0);
16606 #endif
16607
          /* Use whichever of the two bounds comes first. */
16608     if (plast && plast < last)
16609         last= plast;
16610
          /* 'op' still holds the previous iteration's opcode when the loop
           * condition is tested, so the END node itself is printed before the
           * loop exits. */
16611     while (PL_regkind[op] != END && (!last || node < last)) {
16612         /* While that wasn't END last time... */
16613         NODE_ALIGN(node);
16614         op = OP(node);
              /* Closing nodes are printed one level further out; the matching
               * increment for CURLYX and OPEN is at the bottom of the loop. */
16615         if (op == CLOSE || op == WHILEM)
16616             indent--;
16617         next = regnext((regnode *)node);
16618
16619         /* Where, what. */
              /* Of a run of OPTIMIZED nodes only the first is printed, and
               * only when the optimise debug flag is set; the rest skip
               * straight to the advance logic.  Any other node closes an open
               * run, which prints the run's node count. */
16620         if (OP(node) == OPTIMIZED) {
16621             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16622                 optstart = node;
16623             else
16624                 goto after_print;
16625         } else
16626             CLEAR_OPTSTART;
16627
16628         regprop(r, sv, node, NULL);
16629         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16630                       (int)(2*indent + 1), "", SvPVX_const(sv));
16631
              /* Append where this node links to.  An OPTIMIZED node gets no
               * link and no newline here; its line is finished later by
               * CLEAR_OPTSTART. */
16632         if (OP(node) != OPTIMIZED) {
16633             if (next == NULL)           /* Next ptr. */
16634                 PerlIO_printf(Perl_debug_log, " (0)");
16635             else if (PL_regkind[(U8)op] == BRANCH
16636                      && PL_regkind[OP(next)] != BRANCH )
16637                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16638             else
16639                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16640             (void)PerlIO_putc(Perl_debug_log, '\n');
16641         }
16642
              /* From here on: dump any operand nodes and advance 'node' past
               * the current one, by node kind. */
16643       after_print:
16644         if (PL_regkind[(U8)op] == BRANCHJ) {
16645             assert(next);
16646             {
                      /* If the link lands on a LONGJMP, follow that one too,
                       * but never dump past 'last'. */
16647                 const regnode *nnode = (OP(next) == LONGJMP
16648                                        ? regnext((regnode *)next)
16649                                        : next);
16650                 if (last && nnode > last)
16651                     nnode = last;
16652                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16653             }
16654         }
16655         else if (PL_regkind[(U8)op] == BRANCH) {
16656             assert(next);
16657             DUMPUNTIL(NEXTOPER(node), next);
16658         }
16659         else if ( PL_regkind[(U8)op]  == TRIE ) {
16660             const regnode *this_trie = node;
                  /* NOTE: this 'op' shadows the function-level one for the
                   * rest of this branch. */
16661             const char op = OP(node);
16662             const U32 n = ARG(node);
                  /* For opcodes at or above AHOCORASICK the data slot holds a
                   * reg_ac_data whose 'trie' member indexes the real trie;
                   * otherwise the slot is the trie itself. */
16663             const reg_ac_data * const ac = op>=AHOCORASICK ?
16664                (reg_ac_data *)ri->data->data[n] :
16665                NULL;
16666             const reg_trie_data * const trie =
16667                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16668 #ifdef DEBUGGING
16669             AV *const trie_words
16670                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16671 #endif
16672             const regnode *nextbranch= NULL;
16673             I32 word_idx;
16674             sv_setpvs(sv, "");
                  /* One output line per word stored in the trie. */
16675             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16676                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16677
16678                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16679                    (int)(2*(indent+3)), "",
16680                     elem_ptr
16681                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16682                                 SvCUR(*elem_ptr), 60,
16683                                 PL_colors[0], PL_colors[1],
16684                                 (SvUTF8(*elem_ptr)
16685                                  ? PERL_PV_ESCAPE_UNI
16686                                  : 0)
16687                                 | PERL_PV_PRETTY_ELLIPSES
16688                                 | PERL_PV_PRETTY_LTGT
16689                             )
16690                     : "???"
16691                 );
                      /* With a jump table, each word has an offset (relative
                       * to the trie node) to the nodes that follow it; print
                       * that target and, when the offset is non-zero, dump
                       * the nodes from there up to 'nextbranch'. */
16692                 if (trie->jump) {
16693                     U16 dist= trie->jump[word_idx+1];
16694                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16695                                (UV)((dist ? this_trie + dist : next) - start));
16696                     if (dist) {
16697                         if (!nextbranch)
16698                             nextbranch= this_trie + trie->jump[0];
16699                         DUMPUNTIL(this_trie + dist, nextbranch);
16700                     }
16701                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16702                         nextbranch= regnext((regnode *)nextbranch);
16703                 } else {
16704                     PerlIO_printf(Perl_debug_log, "\n");
16705                 }
16706             }
                  /* Resume after the trie, without overshooting 'last'. */
16707             if (last && next > last)
16708                 node= last;
16709             else
16710                 node= next;
16711         }
16712         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16713             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16714                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16715         }
16716         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16717             assert(next);
16718             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16719         }
16720         else if ( op == PLUS || op == STAR) {
16721             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16722         }
16723         else if (PL_regkind[(U8)op] == ANYOF) {
16724             /* arglen 1 + class block */
16725             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16726                           ? ANYOF_POSIXL_SKIP
16727                           : ANYOF_SKIP);
16728             node = NEXTOPER(node);
16729         }
16730         else if (PL_regkind[(U8)op] == EXACT) {
16731             /* Literal string, where present. */
16732             node += NODE_SZ_STR(node) - 1;
16733             node = NEXTOPER(node);
16734         }
16735         else {
                  /* Any other node: step over it and its fixed-size
                   * arguments, as given by regarglen[]. */
16736             node = NEXTOPER(node);
16737             node += regarglen[(U8)op];
16738         }
              /* Whatever follows an opening node is printed one level in. */
16739         if (op == CURLYX || op == OPEN)
16740             indent++;
16741     }
          /* Finish the line of an OPTIMIZED run still open at the end. */
16742     CLEAR_OPTSTART;
16743 #ifdef DEBUG_DUMPUNTIL
16744     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16745 #endif
16746     return node;
16747 }
16748
16749 #endif  /* DEBUGGING */
16750
16751 /*
16752  * Local variables:
16753  * c-indentation-style: bsd
16754  * c-basic-offset: 4
16755  * indent-tabs-mode: nil
16756  * End:
16757  *
16758  * ex: set ts=8 sts=4 sw=4 et:
16759  */