src/5020001/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103
 104 struct RExC_state_t {
 105     U32         flags;                  /* RXf_* are we folding, multilining? */
 106     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object
 111                                            pprivate field */
 112     char        *start;                 /* Start of input for compile */
 113     char        *end;                   /* End of input for compile */
 114     char        *parse;                 /* Input-scan pointer. */
 115     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 116     regnode     *emit_start;            /* Start of emitted-code area */
 117     regnode     *emit_bound;            /* First regnode outside of the
 118                                            allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 120                                            implies compiling, so don't emit */
 121     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 122                                            large enough for the largest
 123                                            non-EXACTish node, so can use it as
 124                                            scratch in pass1 */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     SSize_t     size;                   /* Code size. */
 129     I32                npar;            /* Capture buffer count, (OPEN) plus
 130                                            one. ("par" 0 is the whole
 131                                            pattern)*/
 132     I32         nestroot;               /* root parens we are in - used by
 133                                            accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151                                            through */
 152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         contains_i;
 156     I32         override_recoding;
 157     I32         in_multi_char_class;
 158     struct reg_code_block *code_blocks; /* positions of literal (?{})
 159                                             within pattern */
 160     int         num_code_blocks;        /* size of code_blocks[] */
 161     int         code_index;             /* next code_blocks[] slot */
 162     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 163 #ifdef ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 190                                                          others */
 191 #endif
 192 #define RExC_emit       (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty    (pRExC_state->naughty)
 197 #define RExC_sawback    (pRExC_state->sawback)
 198 #define RExC_seen       (pRExC_state->seen)
 199 #define RExC_size       (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar       (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen   (pRExC_state->extralen)
 204 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 205 #define RExC_utf8       (pRExC_state->utf8)
 206 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 208 #define RExC_open_parens        (pRExC_state->open_parens)
 209 #define RExC_close_parens       (pRExC_state->close_parens)
 210 #define RExC_opend      (pRExC_state->opend)
 211 #define RExC_paren_names        (pRExC_state->paren_names)
 212 #define RExC_recurse    (pRExC_state->recurse)
 213 #define RExC_recurse_count      (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216                                    (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale    (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223
 224 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 226         ((*s) == '{' && regcurly(s, FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 243
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 245
 246 /* whether trie related optimizations are enabled */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254
 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 260
 261 #define REQUIRE_UTF8    STMT_START {                                       \
 262                                      if (!UTF) {                           \
 263                                          *flagp = RESTART_UTF8;            \
 264                                          return NULL;                      \
 265                                      }                                     \
 266                         } STMT_END
 267
 268 /* This converts the named class defined in regcomp.h to its equivalent class
 269  * number defined in handy.h. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288     /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296      /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303     This is the position the string must appear at, or not before.
 304     It also implicitly (when combined with minlenp) tells us how many
 305     characters must match before the string we are searching for.
 306     Likewise when combined with minlenp and the length of the string it
 307     tells us how many characters must appear after the string we have
 308     found.
 309
 310   - max_offset
 311     Only used for floating strings. This is the rightmost point that
 312     the string can appear at. If set to SSize_t_MAX it indicates that the
 313     string can occur infinitely far to the right.
 314
 315   - minlenp
 316     A pointer to the minimum number of characters of the pattern that the
 317     string was found inside. This is important as in the case of positive
 318     lookahead or positive lookbehind we can have multiple patterns
 319     involved. Consider
 320
 321     /(?=FOO).*F/
 322
 323     The minimum length of the pattern overall is 3, the minimum length
 324     of the lookahead part is 3, but the minimum length of the part that
 325     will actually match is 1. So 'FOO's minimum length is 3, but the
 326     minimum length for the F is 1. This is important as the minimum length
 327     is used to determine offsets in front of and behind the string being
 328     looked for.  Since strings can be composites this is the length of the
 329     pattern at the time it was committed with a scan_commit. Note that
 330     the length is calculated by study_chunk, so that the minimum lengths
 331     are not known until the full pattern has been compiled, thus the
 332     pointer to the value.
 333
 334   - lookbehind
 335
 336     In the case of lookbehind the string being searched for can be
 337     offset past the start point of the final matching string.
 338     If this value was just blithely removed from the min_offset it would
 339     invalidate some of the calculations for how many chars must match
 340     before or after (as they are derived from min_offset and minlen and
 341     the length of the string being searched for).
 342     When the final pattern is compiled and the data is moved from the
 343     scan_data_t structure into the regexp structure the information
 344     about lookbehind is factored in, with the information that would
 345     have been lost precalculated in the end_shift field for the
 346     associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354     /*I32 len_min;      unused */
 355     /*I32 len_delta;    unused */
 356     SSize_t pos_min;
 357     SSize_t pos_delta;
 358     SV *last_found;
 359     SSize_t last_end;       /* min value, <0 unless valid. */
 360     SSize_t last_start_min;
 361     SSize_t last_start_max;
 362     SV **longest;           /* Either &l_fixed, or &l_float. */
 363     SV *longest_fixed;      /* longest fixed string found in pattern */
 364     SSize_t offset_fixed;   /* offset where it starts */
 365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 367     SV *longest_float;      /* longest floating string found in pattern */
 368     SSize_t offset_float_min; /* earliest point in string it can appear */
 369     SSize_t offset_float_max; /* latest point in string it can appear */
 370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372     I32 flags;
 373     I32 whilem_c;
 374     SSize_t *last_closep;
 375     regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z] */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402
 403 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL          0x0001
 405 #define SF_BEFORE_MEOL          0x0002
 406 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408
 409 #define SF_FIX_SHIFT_EOL        (+2)
 410 #define SF_FL_SHIFT_EOL         (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 413 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 414
 415 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 416 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF               0x0040
 418 #define SF_HAS_PAR              0x0080
 419 #define SF_IN_PAR               0x0100
 420 #define SF_HAS_EVAL             0x0200
 421 #define SCF_DO_SUBSTR           0x0400
 422 #define SCF_DO_STCLASS_AND      0x0800
 423 #define SCF_DO_STCLASS_OR       0x1000
 424 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS  0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* The enums for all these are ordered so things work out correctly */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436                                                      == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439                                                      >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453
 454 #define OOB_NAMEDCLASS          -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * looked at. */
 459 #define OOB_UNICODE             0xDEADBEEF
 460
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 463
 464
 465 /* length of regex to show in messages that don't mark a position within */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480                 UTF8fARG(UTF, offset, RExC_precomp), \
 481                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 486  * "...".
 487  */
 488 #define _FAIL(code) STMT_START {                                        \
 489     const char *ellipses = "";                                          \
 490     IV len = RExC_end - RExC_precomp;                                   \
 491                                                                         \
 492     if (!SIZE_ONLY)                                                     \
 493         SAVEFREESV(RExC_rx_sv);                                         \
 494     if (len > RegexLengthToShowInErrorMessages) {                       \
 495         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 496         len = RegexLengthToShowInErrorMessages - 10;                    \
 497         ellipses = "...";                                               \
 498     }                                                                   \
 499     code;                                                               \
 500 } STMT_END
 501
 502 #define FAIL(msg) _FAIL(                            \
 503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 504             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505
 506 #define FAIL2(msg,arg) _FAIL(                       \
 507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 508             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 512  */
 513 #define Simple_vFAIL(m) STMT_START {                                    \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 516             m, REPORT_LOCATION_ARGS(offset));   \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {                           \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL(m);                                    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 532     const IV offset = RExC_parse - RExC_precomp;                        \
 533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 534                       REPORT_LOCATION_ARGS(offset));    \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {                       \
 541     if (!SIZE_ONLY)                                     \
 542         SAVEFREESV(RExC_rx_sv);                         \
 543     Simple_vFAIL2(m, a1);                               \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 553             REPORT_LOCATION_ARGS(offset));      \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {                    \
 560     if (!SIZE_ONLY)                                     \
 561         SAVEFREESV(RExC_rx_sv);                         \
 562     Simple_vFAIL3(m, a1, a2);                           \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 569     const IV offset = RExC_parse - RExC_precomp;                \
 570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 571             REPORT_LOCATION_ARGS(offset));      \
 572 } STMT_END
 573
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL4(m, a1, a2, a3);                       \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582     const IV offset = RExC_parse - RExC_precomp;   \
 583     if (!SIZE_ONLY)                                \
 584         SAVEFREESV(RExC_rx_sv);                    \
 585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586             REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589
 590 /* m is not necessarily a "literal string", in this macro */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594             m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {                                   \
 598     const IV offset = loc - RExC_precomp;                               \
 599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 600             REPORT_LOCATION_ARGS(offset));              \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {                                  \
 604     const IV offset = loc - RExC_precomp;                               \
 605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 606             REPORT_LOCATION_ARGS(offset));              \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {                                   \
 610     const IV offset = loc - RExC_precomp;                               \
 611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 612             m REPORT_LOCATION,                                          \
 613             REPORT_LOCATION_ARGS(offset));              \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {                                \
 617     const IV offset = loc - RExC_precomp;                               \
 618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 619             m REPORT_LOCATION,                                          \
 620             REPORT_LOCATION_ARGS(offset));              \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 626             m REPORT_LOCATION,                                          \
 627             a1, REPORT_LOCATION_ARGS(offset));  \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 631     const IV offset = loc - RExC_precomp;                               \
 632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 633             a1, REPORT_LOCATION_ARGS(offset));  \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 637     const IV offset = loc - RExC_precomp;                               \
 638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 639             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 643     const IV offset = loc - RExC_precomp;                               \
 644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 645             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 649     const IV offset = loc - RExC_precomp;                               \
 650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 651             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 655     const IV offset = loc - RExC_precomp;                               \
 656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 657             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 661     const IV offset = loc - RExC_precomp;                               \
 662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 663             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* Allow for side effects in s */
 668 #define REGC(c,s) STMT_START {                  \
 669     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 694     if (! SIZE_ONLY) {                                                  \
 695         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 696                     __LINE__, (int)(node), (int)(byte)));               \
 697         if((node) < 0) {                                                \
 698             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699                                          (int)(node));                  \
 700         } else {                                                        \
 701             RExC_offsets[2*(node)-1] = (byte);                          \
 702         }                                                               \
 703     }                                                                   \
 704 } STMT_END
 705
 706 #define Set_Node_Offset(node,byte) \
 707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
 710 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 711     if (! SIZE_ONLY) {                                                  \
 712         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 713                 __LINE__, (int)(node), (int)(len)));                    \
 714         if((node) < 0) {                                                \
 715             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716                                          (int)(node));                  \
 717         } else {                                                        \
 718             RExC_offsets[2*(node)] = (len);                             \
 719         }                                                               \
 720     }                                                                   \
 721 } STMT_END
 722
 723 #define Set_Node_Length(node,len) \
 724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726     Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 735 } STMT_END
 736 #endif
 737
 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 739 #define EXPERIMENTAL_INPLACESCAN
 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
 742 #define DEBUG_RExC_seen() \
 743         DEBUG_OPTIMISE_MORE_r({                                             \
 744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                                                                             \
 746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                                                                             \
 749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                                                                             \
 752             if (RExC_seen & REG_GPOS_SEEN)                                  \
 753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                                                                             \
 755             if (RExC_seen & REG_CANY_SEEN)                                  \
 756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                                                                             \
 758             if (RExC_seen & REG_RECURSE_SEEN)                               \
 759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                                                                             \
 761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                                                                             \
 764             if (RExC_seen & REG_VERBARG_SEEN)                               \
 765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                                                                             \
 767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                                                                             \
 770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                                                                             \
 773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                                                                             \
 776             if (RExC_seen & REG_GOSTART_SEEN)                               \
 777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                                                                             \
 779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                                                                             \
 782             PerlIO_printf(Perl_debug_log,"\n");                             \
 783         });
 784
 785 #define DEBUG_STUDYDATA(str,data,depth)                              \
 786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 787     PerlIO_printf(Perl_debug_log,                                    \
 788         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 789         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 790         (int)(depth)*2, "",                                          \
 791         (IV)((data)->pos_min),                                       \
 792         (IV)((data)->pos_delta),                                     \
 793         (UV)((data)->flags),                                         \
 794         (IV)((data)->whilem_c),                                      \
 795         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 796         is_inf ? "INF " : ""                                         \
 797     );                                                               \
 798     if ((data)->last_found)                                          \
 799         PerlIO_printf(Perl_debug_log,                                \
 800             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 801             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 802             SvPVX_const((data)->last_found),                         \
 803             (IV)((data)->last_end),                                  \
 804             (IV)((data)->last_start_min),                            \
 805             (IV)((data)->last_start_max),                            \
 806             ((data)->longest &&                                      \
 807              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 808             SvPVX_const((data)->longest_fixed),                      \
 809             (IV)((data)->offset_fixed),                              \
 810             ((data)->longest &&                                      \
 811              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 812             SvPVX_const((data)->longest_float),                      \
 813             (IV)((data)->offset_float_min),                          \
 814             (IV)((data)->offset_float_max)                           \
 815         );                                                           \
 816     PerlIO_printf(Perl_debug_log,"\n");                              \
 817 });
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825                     SSize_t *minlenp, int is_inf)
 826 {
 827     const STRLEN l = CHR_SVLEN(data->last_found);
 828     const STRLEN old_l = CHR_SVLEN(*data->longest);
 829     GET_RE_DEBUG_FLAGS_DECL;
 830
 831     PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
 833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834         SvSetMagicSV(*data->longest, data->last_found);
 835         if (*data->longest == data->longest_fixed) {
 836             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 837             if (data->flags & SF_BEFORE_EOL)
 838                 data->flags
 839                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840             else
 841                 data->flags &= ~SF_FIX_BEFORE_EOL;
 842             data->minlen_fixed=minlenp;
 843             data->lookbehind_fixed=0;
 844         }
 845         else { /* *data->longest == data->longest_float */
 846             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 847             data->offset_float_max = (l
 848                                       ? data->last_start_max
 849                                       : (data->pos_delta == SSize_t_MAX
 850                                          ? SSize_t_MAX
 851                                          : data->pos_min + data->pos_delta));
 852             if (is_inf
 853                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854                 data->offset_float_max = SSize_t_MAX;
 855             if (data->flags & SF_BEFORE_EOL)
 856                 data->flags
 857                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858             else
 859                 data->flags &= ~SF_FL_BEFORE_EOL;
 860             data->minlen_float=minlenp;
 861             data->lookbehind_float=0;
 862         }
 863     }
 864     SvCUR_set(data->last_found, 0);
 865     {
 866         SV * const sv = data->last_found;
 867         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869             if (mg)
 870                 mg->mg_len = 0;
 871         }
 872     }
 873     data->last_end = -1;
 874     data->flags &= ~SF_BEFORE_EOL;
 875     DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884     /* Set the SSC 'ssc' to match an empty string or any code point */
 885
 886     PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901      * in any way, so there's no point in using it */
 902
 903     UV start, end;
 904     bool ret;
 905
 906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908     assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911         return FALSE;
 912     }
 913
 914     /* See if the list consists solely of the range 0 - Infinity */
 915     invlist_iterinit(ssc->invlist);
 916     ret = invlist_iternext(ssc->invlist, &start, &end)
 917           && start == 0
 918           && end == UV_MAX;
 919
 920     invlist_iterfinish(ssc->invlist);
 921
 922     if (ret) {
 923         return TRUE;
 924     }
 925
 926     /* If e.g., both \w and \W are set, matches everything */
 927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928         int i;
 929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931                 return TRUE;
 932             }
 933         }
 934     }
 935
 936     return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943      * string, any code point, or any posix class under locale */
 944
 945     PERL_ARGS_ASSERT_SSC_INIT;
 946
 947     Zero(ssc, 1, regnode_ssc);
 948     set_ANYOF_SYNTHETIC(ssc);
 949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950     ssc_anything(ssc);
 951
 952     /* If any portion of the regex is to operate under locale rules,
 953      * initialization includes it.  The reason this isn't done for all regexes
 954      * is that the optimizer was written under the assumption that locale was
 955      * all-or-nothing.  Given the complexity and lack of documentation in the
 956      * optimizer, and that there are inadequate test cases for locale, many
 957      * parts of it may not work properly, it is safest to avoid locale unless
 958      * necessary. */
 959     if (RExC_contains_locale) {
 960         ANYOF_POSIXL_SETALL(ssc);
 961     }
 962     else {
 963         ANYOF_POSIXL_ZERO(ssc);
 964     }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969                               const regnode_ssc *ssc)
 970 {
 971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972      * to the list of code points matched, and locale posix classes; hence does
 973      * not check its flags) */
 974
 975     UV start, end;
 976     bool ret;
 977
 978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980     assert(is_ANYOF_SYNTHETIC(ssc));
 981
 982     invlist_iterinit(ssc->invlist);
 983     ret = invlist_iternext(ssc->invlist, &start, &end)
 984           && start == 0
 985           && end == UV_MAX;
 986
 987     invlist_iterfinish(ssc->invlist);
 988
 989     if (! ret) {
 990         return FALSE;
 991     }
 992
 993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994         return FALSE;
 995     }
 996
 997     return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility. */
1009
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018     /* Look at the data structure created by S_set_ANYOF_arg() */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040
1041         /* Get the code points valid only under UTF-8 locales */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063
1064     /* Add in the points from the bit map */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102
1103     return invlist;
1104 }
1105
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero           ssc_init
1108
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118                 const regnode_charclass *and_with)
1119 {
1120     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121      * another SSC or a regular ANYOF class.  Can create false positives. */
1122
1123     SV* anded_cp_list;
1124     U8  anded_flags;
1125
1126     PERL_ARGS_ASSERT_SSC_AND;
1127
1128     assert(is_ANYOF_SYNTHETIC(ssc));
1129
1130     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131      * the code point inversion list and just the relevant flags */
1132     if (is_ANYOF_SYNTHETIC(and_with)) {
1133         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134         anded_flags = ANYOF_FLAGS(and_with);
1135
1136         /* XXX This is a kludge around what appears to be deficiencies in the
1137          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138          * there are paths through the optimizer where it doesn't get weeded
1139          * out when it should.  And if we don't make some extra provision for
1140          * it like the code just below, it doesn't get added when it should.
1141          * This solution is to add it only when AND'ing, which is here, and
1142          * only when what is being AND'ed is the pristine, original node
1143          * matching anything.  Thus it is like adding it to ssc_anything() but
1144          * only when the result is to be AND'ed.  Probably the same solution
1145          * could be adopted for the same problem we have with /l matching,
1146          * which is solved differently in S_ssc_init(), and that would lead to
1147          * fewer false positives than that solution has.  But if this solution
1148          * creates bugs, the consequences are only that a warning isn't raised
1149          * that should be; while the consequences for having /l bugs is
1150          * incorrect matches */
1151         if (ssc_is_anything((regnode_ssc *)and_with)) {
1152             anded_flags |= ANYOF_WARN_SUPER;
1153         }
1154     }
1155     else {
1156         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158     }
1159
1160     ANYOF_FLAGS(ssc) &= anded_flags;
1161
1162     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164      * 'and_with' may be inverted.  When not inverted, we have the situation of
1165      * computing:
1166      *  (C1 | P1) & (C2 | P2)
1167      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1170      *                    <=  ((C1 & C2) | P1 | P2)
1171      * Alternatively, the last few steps could be:
1172      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174      *                    <=  (C1 | C2 | (P1 & P2))
1175      * We favor the second approach if either P1 or P2 is non-empty.  This is
1176      * because these components are a barrier to doing optimizations, as what
1177      * they match cannot be known until the moment of matching as they are
1178      * dependent on the current locale, 'AND"ing them likely will reduce or
1179      * eliminate them.
1180      * But we can do better if we know that C1,P1 are in their initial state (a
1181      * frequent occurrence), each matching everything:
1182      *  (<everything>) & (C2 | P2) =  C2 | P2
1183      * Similarly, if C2,P2 are in their initial state (again a frequent
1184      * occurrence), the result is a no-op
1185      *  (C1 | P1) & (<everything>) =  C1 | P1
1186      *
1187      * Inverted, we have
1188      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191      * */
1192
1193     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194         && ! is_ANYOF_SYNTHETIC(and_with))
1195     {
1196         unsigned int i;
1197
1198         ssc_intersection(ssc,
1199                          anded_cp_list,
1200                          FALSE /* Has already been inverted */
1201                          );
1202
1203         /* If either P1 or P2 is empty, the intersection will be also; can skip
1204          * the loop */
1205         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206             ANYOF_POSIXL_ZERO(ssc);
1207         }
1208         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209
1210             /* Note that the Posix class component P from 'and_with' actually
1211              * looks like:
1212              *      P = Pa | Pb | ... | Pn
1213              * where each component is one posix class, such as in [\w\s].
1214              * Thus
1215              *      ~P = ~(Pa | Pb | ... | Pn)
1216              *         = ~Pa & ~Pb & ... & ~Pn
1217              *        <= ~Pa | ~Pb | ... | ~Pn
1218              * The last is something we can easily calculate, but unfortunately
1219              * is likely to have many false positives.  We could do better
1220              * in some (but certainly not all) instances if two classes in
1221              * P have known relationships.  For example
1222              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223              * So
1224              *      :lower: & :print: = :lower:
1225              * And similarly for classes that must be disjoint.  For example,
1226              * since \s and \w can have no elements in common based on rules in
1227              * the POSIX standard,
1228              *      \w & ^\S = nothing
1229              * Unfortunately, some vendor locales do not meet the Posix
1230              * standard, in particular almost everything by Microsoft.
1231              * The loop below just changes e.g., \w into \W and vice versa */
1232
1233             regnode_charclass_posixl temp;
1234             int add = 1;    /* To calculate the index of the complement */
1235
1236             ANYOF_POSIXL_ZERO(&temp);
1237             for (i = 0; i < ANYOF_MAX; i++) {
1238                 assert(i % 2 != 0
1239                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241
1242                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243                     ANYOF_POSIXL_SET(&temp, i + add);
1244                 }
1245                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246             }
1247             ANYOF_POSIXL_AND(&temp, ssc);
1248
1249         } /* else ssc already has no posixes */
1250     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251          in its initial state */
1252     else if (! is_ANYOF_SYNTHETIC(and_with)
1253              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254     {
1255         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256          * copy it over 'ssc' */
1257         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258             if (is_ANYOF_SYNTHETIC(and_with)) {
1259                 StructCopy(and_with, ssc, regnode_ssc);
1260             }
1261             else {
1262                 ssc->invlist = anded_cp_list;
1263                 ANYOF_POSIXL_ZERO(ssc);
1264                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266                 }
1267             }
1268         }
1269         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271         {
1272             /* One or the other of P1, P2 is non-empty. */
1273             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275             }
1276             ssc_union(ssc, anded_cp_list, FALSE);
1277         }
1278         else { /* P1 = P2 = empty */
1279             ssc_intersection(ssc, anded_cp_list, FALSE);
1280         }
1281     }
1282 }
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294
1295     PERL_ARGS_ASSERT_SSC_OR;
1296
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399     SV* cp_list = _new_invlist(2);
1400
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431
1432     SV* invlist = invlist_clone(ssc->invlist);
1433
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1462                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1463                                : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489             AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504         SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509                             PL_colors[0], PL_colors[1],
1510                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511                             PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524         const U32 base = trie->states[ state ].trans.base;
1525
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538         if ( base ) {
1539             U32 ofs = 0;
1540
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564
1565             PerlIO_printf( Perl_debug_log, "]");
1566
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574             (int)word, (int)(trie->wordinfo[word].prev),
1575             (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588                          U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615             SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617             if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647                           U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716                May be the same as startbranch
1717   last       : Thing following the last branch.
1718                May be the same as tail.
1719   tail       : item following the branch sequence
1720   count      : words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795           <ac>
1796           <ad>
1797           <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817         <foo>
1818         <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827     *(d++) = uv;
1828 */
1829
1830 #define TRIE_STORE_REVCHAR(val)                                            \
1831     STMT_START {                                                           \
1832         if (UTF) {                                                         \
1833             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1834             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1835             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1837             SvPOK_on(zlopp);                                               \
1838             SvUTF8_on(zlopp);                                              \
1839             av_push(revcharmap, zlopp);                                    \
1840         } else {                                                           \
1841             char ooooff = (char)val;                                           \
1842             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1843         }                                                                  \
1844         } STMT_END
1845
1846 /* This gets the next character from the input, folding it if not already
1847  * folded. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849     wordlen++;                                                                \
1850     if ( UTF ) {                                                              \
1851         /* if it is UTF then it is either already folded, or does not need    \
1852          * folding */                                                         \
1853         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854     }                                                                         \
1855     else if (folder == PL_fold_latin1) {                                      \
1856         /* This folder implies Unicode rules, which in the range expressible  \
1857          *  by not UTF is the lower case, with the two exceptions, one of     \
1858          *  which should have been taken care of before calling this */       \
1859         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860         uvc = toLOWER_L1(*uc);                                                \
1861         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862         len = 1;                                                              \
1863     } else {                                                                  \
1864         /* raw data, will be folded later if needed */                        \
1865         uvc = (U32)*uc;                                                       \
1866         len = 1;                                                              \
1867     }                                                                         \
1868 } STMT_END
1869
1870
1871
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876     }                                                           \
1877     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879     TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883     Newxz( trie->states[ state ].trans.list,               \
1884         4, reg_trie_trans_le );                                 \
1885      TRIE_LIST_CUR( state ) = 1;                                \
1886      TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890     U16 dupe= trie->states[ state ].wordnum;                    \
1891     regnode * const noper_next = regnext( noper );              \
1892                                                                 \
1893     DEBUG_r({                                                   \
1894         /* store the word for dumping */                        \
1895         SV* tmp;                                                \
1896         if (OP(noper) != NOTHING)                               \
1897             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1898         else                                                    \
1899             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1900         av_push( trie_words, tmp );                             \
1901     });                                                         \
1902                                                                 \
1903     curword++;                                                  \
1904     trie->wordinfo[curword].prev   = 0;                         \
1905     trie->wordinfo[curword].len    = wordlen;                   \
1906     trie->wordinfo[curword].accept = state;                     \
1907                                                                 \
1908     if ( noper_next < tail ) {                                  \
1909         if (!trie->jump)                                        \
1910             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911                                                  sizeof(U16) ); \
1912         trie->jump[curword] = (U16)(noper_next - convert);      \
1913         if (!jumper)                                            \
1914             jumper = noper_next;                                \
1915         if (!nextbranch)                                        \
1916             nextbranch= regnext(cur);                           \
1917     }                                                           \
1918                                                                 \
1919     if ( dupe ) {                                               \
1920         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921         /* chain, so that when the bits of chain are later    */\
1922         /* linked together, the dups appear in the chain      */\
1923         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924         trie->wordinfo[dupe].prev = curword;                    \
1925     } else {                                                    \
1926         /* we haven't inserted this word yet.                */ \
1927         trie->states[ state ].wordnum = curword;                \
1928     }                                                           \
1929 } STMT_END
1930
1931
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1933      ( ( base + charid >=  ucharcount                                   \
1934          && base + charid < ubound                                      \
1935          && state == trie->trans[ base - ucharcount + charid ].check    \
1936          && trie->trans[ base - ucharcount + charid ].next )            \
1937            ? trie->trans[ base - ucharcount + charid ].next             \
1938            : ( state==1 ? special : 0 )                                 \
1939       )
1940
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     if (!SvIOK(re_trie_maxbuff)) {
2011         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012     }
2013     DEBUG_TRIE_COMPILE_r({
2014         PerlIO_printf( Perl_debug_log,
2015           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016           (int)depth * 2 + 2, "",
2017           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019     });
2020
2021    /* Find the node we are going to overwrite */
2022     if ( first == startbranch && OP( last ) != BRANCH ) {
2023         /* whole branch chain */
2024         convert = first;
2025     } else {
2026         /* branch sub-chain */
2027         convert = NEXTOPER( first );
2028     }
2029
2030     /*  -- First loop and Setup --
2031
2032        We first traverse the branches and scan each word to determine if it
2033        contains widechars, and how many unique chars there are, this is
2034        important as we have to build a table with at least as many columns as we
2035        have unique chars.
2036
2037        We use an array of integers to represent the character codes 0..255
2038        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039        the native representation of the character value as the key and IV's for
2040        the coded index.
2041
2042        *TODO* If we keep track of how many times each character is used we can
2043        remap the columns so that the table compression later on is more
2044        efficient in terms of memory by ensuring the most common value is in the
2045        middle and the least common are on the outside.  IMO this would be better
2046        than a most to least common mapping as theres a decent chance the most
2047        common letter will share a node with the least common, meaning the node
2048        will not be compressible. With a middle is most common approach the worst
2049        case is when we have the least common nodes twice.
2050
2051      */
2052
2053     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054         regnode *noper = NEXTOPER( cur );
2055         const U8 *uc = (U8*)STRING( noper );
2056         const U8 *e  = uc + STR_LEN( noper );
2057         int foldlen = 0;
2058         U32 wordlen      = 0;         /* required init */
2059         STRLEN minchars = 0;
2060         STRLEN maxchars = 0;
2061         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062                                                bitmap?*/
2063
2064         if (OP(noper) == NOTHING) {
2065             regnode *noper_next= regnext(noper);
2066             if (noper_next != tail && OP(noper_next) == flags) {
2067                 noper = noper_next;
2068                 uc= (U8*)STRING(noper);
2069                 e= uc + STR_LEN(noper);
2070                 trie->minlen= STR_LEN(noper);
2071             } else {
2072                 trie->minlen= 0;
2073                 continue;
2074             }
2075         }
2076
2077         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079                                           regardless of encoding */
2080             if (OP( noper ) == EXACTFU_SS) {
2081                 /* false positives are ok, so just set this */
2082                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083             }
2084         }
2085         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086                                            branch */
2087             TRIE_CHARCOUNT(trie)++;
2088             TRIE_READ_CHAR;
2089
2090             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091              * is in effect.  Under /i, this character can match itself, or
2092              * anything that folds to it.  If not under /i, it can match just
2093              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094              * all fold to k, and all are single characters.   But some folds
2095              * expand to more than one character, so for example LATIN SMALL
2096              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097              * the string beginning at 'uc' is 'ffi', it could be matched by
2098              * three characters, or just by the one ligature character. (It
2099              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102              * match.)  The trie needs to know the minimum and maximum number
2103              * of characters that could match so that it can use size alone to
2104              * quickly reject many match attempts.  The max is simple: it is
2105              * the number of folded characters in this branch (since a fold is
2106              * never shorter than what folds to it. */
2107
2108             maxchars++;
2109
2110             /* And the min is equal to the max if not under /i (indicated by
2111              * 'folder' being NULL), or there are no multi-character folds.  If
2112              * there is a multi-character fold, the min is incremented just
2113              * once, for the character that folds to the sequence.  Each
2114              * character in the sequence needs to be added to the list below of
2115              * characters in the trie, but we count only the first towards the
2116              * min number of characters needed.  This is done through the
2117              * variable 'foldlen', which is returned by the macros that look
2118              * for these sequences as the number of bytes the sequence
2119              * occupies.  Each time through the loop, we decrement 'foldlen' by
2120              * how many bytes the current char occupies.  Only when it reaches
2121              * 0 do we increment 'minchars' or look for another multi-character
2122              * sequence. */
2123             if (folder == NULL) {
2124                 minchars++;
2125             }
2126             else if (foldlen > 0) {
2127                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128             }
2129             else {
2130                 minchars++;
2131
2132                 /* See if *uc is the beginning of a multi-character fold.  If
2133                  * so, we decrement the length remaining to look at, to account
2134                  * for the current character this iteration.  (We can use 'uc'
2135                  * instead of the fold returned by TRIE_READ_CHAR because for
2136                  * non-UTF, the latin1_safe macro is smart enough to account
2137                  * for all the unfolded characters, and because for UTF, the
2138                  * string will already have been folded earlier in the
2139                  * compilation process */
2140                 if (UTF) {
2141                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142                         foldlen -= UTF8SKIP(uc);
2143                     }
2144                 }
2145                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146                     foldlen--;
2147                 }
2148             }
2149
2150             /* The current character (and any potential folds) should be added
2151              * to the possible matching characters for this position in this
2152              * branch */
2153             if ( uvc < 256 ) {
2154                 if ( folder ) {
2155                     U8 folded= folder[ (U8) uvc ];
2156                     if ( !trie->charmap[ folded ] ) {
2157                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158                         TRIE_STORE_REVCHAR( folded );
2159                     }
2160                 }
2161                 if ( !trie->charmap[ uvc ] ) {
2162                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163                     TRIE_STORE_REVCHAR( uvc );
2164                 }
2165                 if ( set_bit ) {
2166                     /* store the codepoint in the bitmap, and its folded
2167                      * equivalent. */
2168                     TRIE_BITMAP_SET(trie, uvc);
2169
2170                     /* store the folded codepoint */
2171                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172
2173                     if ( !UTF ) {
2174                         /* store first byte of utf8 representation of
2175                            variant codepoints */
2176                         if (! UVCHR_IS_INVARIANT(uvc)) {
2177                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178                         }
2179                     }
2180                     set_bit = 0; /* We've done our bit :-) */
2181                 }
2182             } else {
2183
2184                 /* XXX We could come up with the list of code points that fold
2185                  * to this using PL_utf8_foldclosures, except not for
2186                  * multi-char folds, as there may be multiple combinations
2187                  * there that could work, which needs to wait until runtime to
2188                  * resolve (The comment about LIGATURE FFI above is such an
2189                  * example */
2190
2191                 SV** svpp;
2192                 if ( !widecharmap )
2193                     widecharmap = newHV();
2194
2195                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196
2197                 if ( !svpp )
2198                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199
2200                 if ( !SvTRUE( *svpp ) ) {
2201                     sv_setiv( *svpp, ++trie->uniquecharcount );
2202                     TRIE_STORE_REVCHAR(uvc);
2203                 }
2204             }
2205         } /* end loop through characters in this branch of the trie */
2206
2207         /* We take the min and max for this branch and combine to find the min
2208          * and max for all branches processed so far */
2209         if( cur == first ) {
2210             trie->minlen = minchars;
2211             trie->maxlen = maxchars;
2212         } else if (minchars < trie->minlen) {
2213             trie->minlen = minchars;
2214         } else if (maxchars > trie->maxlen) {
2215             trie->maxlen = maxchars;
2216         }
2217     } /* end first pass */
2218     DEBUG_TRIE_COMPILE_r(
2219         PerlIO_printf( Perl_debug_log,
2220                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221                 (int)depth * 2 + 2,"",
2222                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224                 (int)trie->minlen, (int)trie->maxlen )
2225     );
2226
2227     /*
2228         We now know what we are dealing with in terms of unique chars and
2229         string sizes so we can calculate how much memory a naive
2230         representation using a flat table  will take. If it's over a reasonable
2231         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232         conservative but potentially much slower representation using an array
2233         of lists.
2234
2235         At the end we convert both representations into the same compressed
2236         form that will be used in regexec.c for matching with. The latter
2237         is a form that cannot be used to construct with but has memory
2238         properties similar to the list form and access properties similar
2239         to the table form making it both suitable for fast searches and
2240         small enough that its feasable to store for the duration of a program.
2241
2242         See the comment in the code where the compressed table is produced
2243         inplace from the flat tabe representation for an explanation of how
2244         the compression works.
2245
2246     */
2247
2248
2249     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250     prev_states[1] = 0;
2251
2252     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253                                                     > SvIV(re_trie_maxbuff) )
2254     {
2255         /*
2256             Second Pass -- Array Of Lists Representation
2257
2258             Each state will be represented by a list of charid:state records
2259             (reg_trie_trans_le) the first such element holds the CUR and LEN
2260             points of the allocated array. (See defines above).
2261
2262             We build the initial structure using the lists, and then convert
2263             it into the compressed table form which allows faster lookups
2264             (but cant be modified once converted).
2265         */
2266
2267         STRLEN transcount = 1;
2268
2269         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270             "%*sCompiling trie using list compiler\n",
2271             (int)depth * 2 + 2, ""));
2272
2273         trie->states = (reg_trie_state *)
2274             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275                                   sizeof(reg_trie_state) );
2276         TRIE_LIST_NEW(1);
2277         next_alloc = 2;
2278
2279         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280
2281             regnode *noper   = NEXTOPER( cur );
2282             U8 *uc           = (U8*)STRING( noper );
2283             const U8 *e      = uc + STR_LEN( noper );
2284             U32 state        = 1;         /* required init */
2285             U16 charid       = 0;         /* sanity init */
2286             U32 wordlen      = 0;         /* required init */
2287
2288             if (OP(noper) == NOTHING) {
2289                 regnode *noper_next= regnext(noper);
2290                 if (noper_next != tail && OP(noper_next) == flags) {
2291                     noper = noper_next;
2292                     uc= (U8*)STRING(noper);
2293                     e= uc + STR_LEN(noper);
2294                 }
2295             }
2296
2297             if (OP(noper) != NOTHING) {
2298                 for ( ; uc < e ; uc += len ) {
2299
2300                     TRIE_READ_CHAR;
2301
2302                     if ( uvc < 256 ) {
2303                         charid = trie->charmap[ uvc ];
2304                     } else {
2305                         SV** const svpp = hv_fetch( widecharmap,
2306                                                     (char*)&uvc,
2307                                                     sizeof( UV ),
2308                                                     0);
2309                         if ( !svpp ) {
2310                             charid = 0;
2311                         } else {
2312                             charid=(U16)SvIV( *svpp );
2313                         }
2314                     }
2315                     /* charid is now 0 if we dont know the char read, or
2316                      * nonzero if we do */
2317                     if ( charid ) {
2318
2319                         U16 check;
2320                         U32 newstate = 0;
2321
2322                         charid--;
2323                         if ( !trie->states[ state ].trans.list ) {
2324                             TRIE_LIST_NEW( state );
2325                         }
2326                         for ( check = 1;
2327                               check <= TRIE_LIST_USED( state );
2328                               check++ )
2329                         {
2330                             if ( TRIE_LIST_ITEM( state, check ).forid
2331                                                                     == charid )
2332                             {
2333                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334                                 break;
2335                             }
2336                         }
2337                         if ( ! newstate ) {
2338                             newstate = next_alloc++;
2339                             prev_states[newstate] = state;
2340                             TRIE_LIST_PUSH( state, charid, newstate );
2341                             transcount++;
2342                         }
2343                         state = newstate;
2344                     } else {
2345                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346                     }
2347                 }
2348             }
2349             TRIE_HANDLE_WORD(state);
2350
2351         } /* end second pass */
2352
2353         /* next alloc is the NEXT state to be allocated */
2354         trie->statecount = next_alloc;
2355         trie->states = (reg_trie_state *)
2356             PerlMemShared_realloc( trie->states,
2357                                    next_alloc
2358                                    * sizeof(reg_trie_state) );
2359
2360         /* and now dump it out before we compress it */
2361         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362                                                          revcharmap, next_alloc,
2363                                                          depth+1)
2364         );
2365
2366         trie->trans = (reg_trie_trans *)
2367             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368         {
2369             U32 state;
2370             U32 tp = 0;
2371             U32 zp = 0;
2372
2373
2374             for( state=1 ; state < next_alloc ; state ++ ) {
2375                 U32 base=0;
2376
2377                 /*
2378                 DEBUG_TRIE_COMPILE_MORE_r(
2379                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380                 );
2381                 */
2382
2383                 if (trie->states[state].trans.list) {
2384                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385                     U16 maxid=minid;
2386                     U16 idx;
2387
2388                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390                         if ( forid < minid ) {
2391                             minid=forid;
2392                         } else if ( forid > maxid ) {
2393                             maxid=forid;
2394                         }
2395                     }
2396                     if ( transcount < tp + maxid - minid + 1) {
2397                         transcount *= 2;
2398                         trie->trans = (reg_trie_trans *)
2399                             PerlMemShared_realloc( trie->trans,
2400                                                      transcount
2401                                                      * sizeof(reg_trie_trans) );
2402                         Zero( trie->trans + (transcount / 2),
2403                               transcount / 2,
2404                               reg_trie_trans );
2405                     }
2406                     base = trie->uniquecharcount + tp - minid;
2407                     if ( maxid == minid ) {
2408                         U32 set = 0;
2409                         for ( ; zp < tp ; zp++ ) {
2410                             if ( ! trie->trans[ zp ].next ) {
2411                                 base = trie->uniquecharcount + zp - minid;
2412                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                                                                    1).newstate;
2414                                 trie->trans[ zp ].check = state;
2415                                 set = 1;
2416                                 break;
2417                             }
2418                         }
2419                         if ( !set ) {
2420                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                                                                    1).newstate;
2422                             trie->trans[ tp ].check = state;
2423                             tp++;
2424                             zp = tp;
2425                         }
2426                     } else {
2427                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428                             const U32 tid = base
2429                                            - trie->uniquecharcount
2430                                            + TRIE_LIST_ITEM( state, idx ).forid;
2431                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                                                                 idx ).newstate;
2433                             trie->trans[ tid ].check = state;
2434                         }
2435                         tp += ( maxid - minid + 1 );
2436                     }
2437                     Safefree(trie->states[ state ].trans.list);
2438                 }
2439                 /*
2440                 DEBUG_TRIE_COMPILE_MORE_r(
2441                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442                 );
2443                 */
2444                 trie->states[ state ].trans.base=base;
2445             }
2446             trie->lasttrans = tp + 1;
2447         }
2448     } else {
2449         /*
2450            Second Pass -- Flat Table Representation.
2451
2452            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453            each.  We know that we will need Charcount+1 trans at most to store
2454            the data (one row per char at worst case) So we preallocate both
2455            structures assuming worst case.
2456
2457            We then construct the trie using only the .next slots of the entry
2458            structs.
2459
2460            We use the .check field of the first entry of the node temporarily
2461            to make compression both faster and easier by keeping track of how
2462            many non zero fields are in the node.
2463
2464            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465            transition.
2466
2467            There are two terms at use here: state as a TRIE_NODEIDX() which is
2468            a number representing the first entry of the node, and state as a
2469            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471            if there are 2 entrys per node. eg:
2472
2473              A B       A B
2474           1. 2 4    1. 3 7
2475           2. 0 3    3. 0 5
2476           3. 0 0    5. 0 0
2477           4. 0 0    7. 0 0
2478
2479            The table is internally in the right hand, idx form. However as we
2480            also have to deal with the states array which is indexed by nodenum
2481            we have to use TRIE_NODENUM() to convert.
2482
2483         */
2484         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485             "%*sCompiling trie using table compiler\n",
2486             (int)depth * 2 + 2, ""));
2487
2488         trie->trans = (reg_trie_trans *)
2489             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490                                   * trie->uniquecharcount + 1,
2491                                   sizeof(reg_trie_trans) );
2492         trie->states = (reg_trie_state *)
2493             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494                                   sizeof(reg_trie_state) );
2495         next_alloc = trie->uniquecharcount + 1;
2496
2497
2498         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499
2500             regnode *noper   = NEXTOPER( cur );
2501             const U8 *uc     = (U8*)STRING( noper );
2502             const U8 *e      = uc + STR_LEN( noper );
2503
2504             U32 state        = 1;         /* required init */
2505
2506             U16 charid       = 0;         /* sanity init */
2507             U32 accept_state = 0;         /* sanity init */
2508
2509             U32 wordlen      = 0;         /* required init */
2510
2511             if (OP(noper) == NOTHING) {
2512                 regnode *noper_next= regnext(noper);
2513                 if (noper_next != tail && OP(noper_next) == flags) {
2514                     noper = noper_next;
2515                     uc= (U8*)STRING(noper);
2516                     e= uc + STR_LEN(noper);
2517                 }
2518             }
2519
2520             if ( OP(noper) != NOTHING ) {
2521                 for ( ; uc < e ; uc += len ) {
2522
2523                     TRIE_READ_CHAR;
2524
2525                     if ( uvc < 256 ) {
2526                         charid = trie->charmap[ uvc ];
2527                     } else {
2528                         SV* const * const svpp = hv_fetch( widecharmap,
2529                                                            (char*)&uvc,
2530                                                            sizeof( UV ),
2531                                                            0);
2532                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2533                     }
2534                     if ( charid ) {
2535                         charid--;
2536                         if ( !trie->trans[ state + charid ].next ) {
2537                             trie->trans[ state + charid ].next = next_alloc;
2538                             trie->trans[ state ].check++;
2539                             prev_states[TRIE_NODENUM(next_alloc)]
2540                                     = TRIE_NODENUM(state);
2541                             next_alloc += trie->uniquecharcount;
2542                         }
2543                         state = trie->trans[ state + charid ].next;
2544                     } else {
2545                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546                     }
2547                     /* charid is now 0 if we dont know the char read, or
2548                      * nonzero if we do */
2549                 }
2550             }
2551             accept_state = TRIE_NODENUM( state );
2552             TRIE_HANDLE_WORD(accept_state);
2553
2554         } /* end second pass */
2555
2556         /* and now dump it out before we compress it */
2557         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558                                                           revcharmap,
2559                                                           next_alloc, depth+1));
2560
2561         {
2562         /*
2563            * Inplace compress the table.*
2564
2565            For sparse data sets the table constructed by the trie algorithm will
2566            be mostly 0/FAIL transitions or to put it another way mostly empty.
2567            (Note that leaf nodes will not contain any transitions.)
2568
2569            This algorithm compresses the tables by eliminating most such
2570            transitions, at the cost of a modest bit of extra work during lookup:
2571
2572            - Each states[] entry contains a .base field which indicates the
2573            index in the state[] array wheres its transition data is stored.
2574
2575            - If .base is 0 there are no valid transitions from that node.
2576
2577            - If .base is nonzero then charid is added to it to find an entry in
2578            the trans array.
2579
2580            -If trans[states[state].base+charid].check!=state then the
2581            transition is taken to be a 0/Fail transition. Thus if there are fail
2582            transitions at the front of the node then the .base offset will point
2583            somewhere inside the previous nodes data (or maybe even into a node
2584            even earlier), but the .check field determines if the transition is
2585            valid.
2586
2587            XXX - wrong maybe?
2588            The following process inplace converts the table to the compressed
2589            table: We first do not compress the root node 1,and mark all its
2590            .check pointers as 1 and set its .base pointer as 1 as well. This
2591            allows us to do a DFA construction from the compressed table later,
2592            and ensures that any .base pointers we calculate later are greater
2593            than 0.
2594
2595            - We set 'pos' to indicate the first entry of the second node.
2596
2597            - We then iterate over the columns of the node, finding the first and
2598            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599            and set the .check pointers accordingly, and advance pos
2600            appropriately and repreat for the next node. Note that when we copy
2601            the next pointers we have to convert them from the original
2602            NODEIDX form to NODENUM form as the former is not valid post
2603            compression.
2604
2605            - If a node has no transitions used we mark its base as 0 and do not
2606            advance the pos pointer.
2607
2608            - If a node only has one transition we use a second pointer into the
2609            structure to fill in allocated fail transitions from other states.
2610            This pointer is independent of the main pointer and scans forward
2611            looking for null transitions that are allocated to a state. When it
2612            finds one it writes the single transition into the "hole".  If the
2613            pointer doesnt find one the single transition is appended as normal.
2614
2615            - Once compressed we can Renew/realloc the structures to release the
2616            excess space.
2617
2618            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619            specifically Fig 3.47 and the associated pseudocode.
2620
2621            demq
2622         */
2623         const U32 laststate = TRIE_NODENUM( next_alloc );
2624         U32 state, charid;
2625         U32 pos = 0, zp=0;
2626         trie->statecount = laststate;
2627
2628         for ( state = 1 ; state < laststate ; state++ ) {
2629             U8 flag = 0;
2630             const U32 stateidx = TRIE_NODEIDX( state );
2631             const U32 o_used = trie->trans[ stateidx ].check;
2632             U32 used = trie->trans[ stateidx ].check;
2633             trie->trans[ stateidx ].check = 0;
2634
2635             for ( charid = 0;
2636                   used && charid < trie->uniquecharcount;
2637                   charid++ )
2638             {
2639                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640                     if ( trie->trans[ stateidx + charid ].next ) {
2641                         if (o_used == 1) {
2642                             for ( ; zp < pos ; zp++ ) {
2643                                 if ( ! trie->trans[ zp ].next ) {
2644                                     break;
2645                                 }
2646                             }
2647                             trie->states[ state ].trans.base
2648                                                     = zp
2649                                                       + trie->uniquecharcount
2650                                                       - charid ;
2651                             trie->trans[ zp ].next
2652                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                                                              + charid ].next );
2654                             trie->trans[ zp ].check = state;
2655                             if ( ++zp > pos ) pos = zp;
2656                             break;
2657                         }
2658                         used--;
2659                     }
2660                     if ( !flag ) {
2661                         flag = 1;
2662                         trie->states[ state ].trans.base
2663                                        = pos + trie->uniquecharcount - charid ;
2664                     }
2665                     trie->trans[ pos ].next
2666                         = SAFE_TRIE_NODENUM(
2667                                        trie->trans[ stateidx + charid ].next );
2668                     trie->trans[ pos ].check = state;
2669                     pos++;
2670                 }
2671             }
2672         }
2673         trie->lasttrans = pos + 1;
2674         trie->states = (reg_trie_state *)
2675             PerlMemShared_realloc( trie->states, laststate
2676                                    * sizeof(reg_trie_state) );
2677         DEBUG_TRIE_COMPILE_MORE_r(
2678             PerlIO_printf( Perl_debug_log,
2679                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680                 (int)depth * 2 + 2,"",
2681                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682                        + 1 ),
2683                 (IV)next_alloc,
2684                 (IV)pos,
2685                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686             );
2687
2688         } /* end table compress */
2689     }
2690     DEBUG_TRIE_COMPILE_MORE_r(
2691             PerlIO_printf(Perl_debug_log,
2692                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693                 (int)depth * 2 + 2, "",
2694                 (UV)trie->statecount,
2695                 (UV)trie->lasttrans)
2696     );
2697     /* resize the trans array to remove unused space */
2698     trie->trans = (reg_trie_trans *)
2699         PerlMemShared_realloc( trie->trans, trie->lasttrans
2700                                * sizeof(reg_trie_trans) );
2701
2702     {   /* Modify the program and insert the new TRIE node */
2703         U8 nodetype =(U8)(flags & 0xFF);
2704         char *str=NULL;
2705
2706 #ifdef DEBUGGING
2707         regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709
2710         U32 mjd_offset = 0;
2711         U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714         /*
2715            This means we convert either the first branch or the first Exact,
2716            depending on whether the thing following (in 'last') is a branch
2717            or not and whther first is the startbranch (ie is it a sub part of
2718            the alternation or is it the whole thing.)
2719            Assuming its a sub part we convert the EXACT otherwise we convert
2720            the whole branch sequence, including the first.
2721          */
2722         /* Find the node we are going to overwrite */
2723         if ( first != startbranch || OP( last ) == BRANCH ) {
2724             /* branch sub-chain */
2725             NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727             DEBUG_r({
2728                 mjd_offset= Node_Offset((convert));
2729                 mjd_nodelen= Node_Length((convert));
2730             });
2731 #endif
2732             /* whole branch chain */
2733         }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735         else {
2736             DEBUG_r({
2737                 const  regnode *nop = NEXTOPER( convert );
2738                 mjd_offset= Node_Offset((nop));
2739                 mjd_nodelen= Node_Length((nop));
2740             });
2741         }
2742         DEBUG_OPTIMISE_r(
2743             PerlIO_printf(Perl_debug_log,
2744                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745                 (int)depth * 2 + 2, "",
2746                 (UV)mjd_offset, (UV)mjd_nodelen)
2747         );
2748 #endif
2749         /* But first we check to see if there is a common prefix we can
2750            split out as an EXACT and put in front of the TRIE node.  */
2751         trie->startstate= 1;
2752         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753             U32 state;
2754             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755                 U32 ofs = 0;
2756                 I32 idx = -1;
2757                 U32 count = 0;
2758                 const U32 base = trie->states[ state ].trans.base;
2759
2760                 if ( trie->states[state].wordnum )
2761                         count = 1;
2762
2763                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2765                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767                     {
2768                         if ( ++count > 1 ) {
2769                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2770                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771                             if ( state == 1 ) break;
2772                             if ( count == 2 ) {
2773                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774                                 DEBUG_OPTIMISE_r(
2775                                     PerlIO_printf(Perl_debug_log,
2776                                         "%*sNew Start State=%"UVuf" Class: [",
2777                                         (int)depth * 2 + 2, "",
2778                                         (UV)state));
2779                                 if (idx >= 0) {
2780                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782
2783                                     TRIE_BITMAP_SET(trie,*ch);
2784                                     if ( folder )
2785                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786                                     DEBUG_OPTIMISE_r(
2787                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788                                     );
2789                                 }
2790                             }
2791                             TRIE_BITMAP_SET(trie,*ch);
2792                             if ( folder )
2793                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795                         }
2796                         idx = ofs;
2797                     }
2798                 }
2799                 if ( count == 1 ) {
2800                     SV **tmp = av_fetch( revcharmap, idx, 0);
2801                     STRLEN len;
2802                     char *ch = SvPV( *tmp, len );
2803                     DEBUG_OPTIMISE_r({
2804                         SV *sv=sv_newmortal();
2805                         PerlIO_printf( Perl_debug_log,
2806                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807                             (int)depth * 2 + 2, "",
2808                             (UV)state, (UV)idx,
2809                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810                                 PL_colors[0], PL_colors[1],
2811                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812                                 PERL_PV_ESCAPE_FIRSTCHAR
2813                             )
2814                         );
2815                     });
2816                     if ( state==1 ) {
2817                         OP( convert ) = nodetype;
2818                         str=STRING(convert);
2819                         STR_LEN(convert)=0;
2820                     }
2821                     STR_LEN(convert) += len;
2822                     while (len--)
2823                         *str++ = *ch++;
2824                 } else {
2825 #ifdef DEBUGGING
2826                     if (state>1)
2827                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829                     break;
2830                 }
2831             }
2832             trie->prefixlen = (state-1);
2833             if (str) {
2834                 regnode *n = convert+NODE_SZ_STR(convert);
2835                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836                 trie->startstate = state;
2837                 trie->minlen -= (state - 1);
2838                 trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840                /* At least the UNICOS C compiler choked on this
2841                 * being argument to DEBUG_r(), so let's just have
2842                 * it right here. */
2843                if (
2844 #ifdef PERL_EXT_RE_BUILD
2845                    1
2846 #else
2847                    DEBUG_r_TEST
2848 #endif
2849                    ) {
2850                    regnode *fix = convert;
2851                    U32 word = trie->wordcount;
2852                    mjd_nodelen++;
2853                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854                    while( ++fix < n ) {
2855                        Set_Node_Offset_Length(fix, 0, 0);
2856                    }
2857                    while (word--) {
2858                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2859                        if (tmp) {
2860                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862                            else
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864                        }
2865                    }
2866                }
2867 #endif
2868                 if (trie->maxlen) {
2869                     convert = n;
2870                 } else {
2871                     NEXT_OFF(convert) = (U16)(tail - convert);
2872                     DEBUG_r(optimize= n);
2873                 }
2874             }
2875         }
2876         if (!jumper)
2877             jumper = last;
2878         if ( trie->maxlen ) {
2879             NEXT_OFF( convert ) = (U16)(tail - convert);
2880             ARG_SET( convert, data_slot );
2881             /* Store the offset to the first unabsorbed branch in
2882                jump[0], which is otherwise unused by the jump logic.
2883                We use this when dumping a trie and during optimisation. */
2884             if (trie->jump)
2885                 trie->jump[0] = (U16)(nextbranch - convert);
2886
2887             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888              *   and there is a bitmap
2889              *   and the first "jump target" node we found leaves enough room
2890              * then convert the TRIE node into a TRIEC node, with the bitmap
2891              * embedded inline in the opcode - this is hypothetically faster.
2892              */
2893             if ( !trie->states[trie->startstate].wordnum
2894                  && trie->bitmap
2895                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896             {
2897                 OP( convert ) = TRIEC;
2898                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899                 PerlMemShared_free(trie->bitmap);
2900                 trie->bitmap= NULL;
2901             } else
2902                 OP( convert ) = TRIE;
2903
2904             /* store the type in the flags */
2905             convert->flags = nodetype;
2906             DEBUG_r({
2907             optimize = convert
2908                       + NODE_STEP_REGNODE
2909                       + regarglen[ OP( convert ) ];
2910             });
2911             /* XXX We really should free up the resource in trie now,
2912                    as we won't use them - (which resources?) dmq */
2913         }
2914         /* needed for dumping*/
2915         DEBUG_r(if (optimize) {
2916             regnode *opt = convert;
2917
2918             while ( ++opt < optimize) {
2919                 Set_Node_Offset_Length(opt,0,0);
2920             }
2921             /*
2922                 Try to clean up some of the debris left after the
2923                 optimisation.
2924              */
2925             while( optimize < jumper ) {
2926                 mjd_nodelen += Node_Length((optimize));
2927                 OP( optimize ) = OPTIMIZED;
2928                 Set_Node_Offset_Length(optimize,0,0);
2929                 optimize++;
2930             }
2931             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932         });
2933     } /* end node insert */
2934
2935     /*  Finish populating the prev field of the wordinfo array.  Walk back
2936      *  from each accept state until we find another accept state, and if
2937      *  so, point the first word's .prev field at the second word. If the
2938      *  second already has a .prev field set, stop now. This will be the
2939      *  case either if we've already processed that word's accept state,
2940      *  or that state had multiple words, and the overspill words were
2941      *  already linked up earlier.
2942      */
2943     {
2944         U16 word;
2945         U32 state;
2946         U16 prev;
2947
2948         for (word=1; word <= trie->wordcount; word++) {
2949             prev = 0;
2950             if (trie->wordinfo[word].prev)
2951                 continue;
2952             state = trie->wordinfo[word].accept;
2953             while (state) {
2954                 state = prev_states[state];
2955                 if (!state)
2956                     break;
2957                 prev = trie->states[state].wordnum;
2958                 if (prev)
2959                     break;
2960             }
2961             trie->wordinfo[word].prev = prev;
2962         }
2963         Safefree(prev_states);
2964     }
2965
2966
2967     /* and now dump out the compressed format */
2968     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2969
2970     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2971 #ifdef DEBUGGING
2972     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2973     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2974 #else
2975     SvREFCNT_dec_NN(revcharmap);
2976 #endif
2977     return trie->jump
2978            ? MADE_JUMP_TRIE
2979            : trie->startstate>1
2980              ? MADE_EXACT_TRIE
2981              : MADE_TRIE;
2982 }
2983
2984 STATIC void
2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2986 {
2987 /* The Trie is constructed and compressed now so we can build a fail array if
2988  * it's needed
2989
2990    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2991    3.32 in the
2992    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2993    Ullman 1985/88
2994    ISBN 0-201-10088-6
2995
2996    We find the fail state for each state in the trie, this state is the longest
2997    proper suffix of the current state's 'word' that is also a proper prefix of
2998    another word in our trie. State 1 represents the word '' and is thus the
2999    default fail state. This allows the DFA not to have to restart after its
3000    tried and failed a word at a given point, it simply continues as though it
3001    had been matching the other word in the first place.
3002    Consider
3003       'abcdgu'=~/abcdefg|cdgu/
3004    When we get to 'd' we are still matching the first word, we would encounter
3005    'g' which would fail, which would bring us to the state representing 'd' in
3006    the second word where we would try 'g' and succeed, proceeding to match
3007    'cdgu'.
3008  */
3009  /* add a fail transition */
3010     const U32 trie_offset = ARG(source);
3011     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3012     U32 *q;
3013     const U32 ucharcount = trie->uniquecharcount;
3014     const U32 numstates = trie->statecount;
3015     const U32 ubound = trie->lasttrans + ucharcount;
3016     U32 q_read = 0;
3017     U32 q_write = 0;
3018     U32 charid;
3019     U32 base = trie->states[ 1 ].trans.base;
3020     U32 *fail;
3021     reg_ac_data *aho;
3022     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3023     GET_RE_DEBUG_FLAGS_DECL;
3024
3025     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3026 #ifndef DEBUGGING
3027     PERL_UNUSED_ARG(depth);
3028 #endif
3029
3030
3031     ARG_SET( stclass, data_slot );
3032     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3033     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3034     aho->trie=trie_offset;
3035     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3036     Copy( trie->states, aho->states, numstates, reg_trie_state );
3037     Newxz( q, numstates, U32);
3038     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3039     aho->refcount = 1;
3040     fail = aho->fail;
3041     /* initialize fail[0..1] to be 1 so that we always have
3042        a valid final fail state */
3043     fail[ 0 ] = fail[ 1 ] = 1;
3044
3045     for ( charid = 0; charid < ucharcount ; charid++ ) {
3046         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3047         if ( newstate ) {
3048             q[ q_write ] = newstate;
3049             /* set to point at the root */
3050             fail[ q[ q_write++ ] ]=1;
3051         }
3052     }
3053     while ( q_read < q_write) {
3054         const U32 cur = q[ q_read++ % numstates ];
3055         base = trie->states[ cur ].trans.base;
3056
3057         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3058             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3059             if (ch_state) {
3060                 U32 fail_state = cur;
3061                 U32 fail_base;
3062                 do {
3063                     fail_state = fail[ fail_state ];
3064                     fail_base = aho->states[ fail_state ].trans.base;
3065                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3066
3067                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3068                 fail[ ch_state ] = fail_state;
3069                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3070                 {
3071                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3072                 }
3073                 q[ q_write++ % numstates] = ch_state;
3074             }
3075         }
3076     }
3077     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3078        when we fail in state 1, this allows us to use the
3079        charclass scan to find a valid start char. This is based on the principle
3080        that theres a good chance the string being searched contains lots of stuff
3081        that cant be a start char.
3082      */
3083     fail[ 0 ] = fail[ 1 ] = 0;
3084     DEBUG_TRIE_COMPILE_r({
3085         PerlIO_printf(Perl_debug_log,
3086                       "%*sStclass Failtable (%"UVuf" states): 0",
3087                       (int)(depth * 2), "", (UV)numstates
3088         );
3089         for( q_read=1; q_read<numstates; q_read++ ) {
3090             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3091         }
3092         PerlIO_printf(Perl_debug_log, "\n");
3093     });
3094     Safefree(q);
3095     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3096 }
3097
3098
3099 #define DEBUG_PEEP(str,scan,depth) \
3100     DEBUG_OPTIMISE_r({if (scan){ \
3101        SV * const mysv=sv_newmortal(); \
3102        regnode *Next = regnext(scan); \
3103        regprop(RExC_rx, mysv, scan, NULL); \
3104        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3105        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3106        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3107    }});
3108
3109
3110 /* The below joins as many adjacent EXACTish nodes as possible into a single
3111  * one.  The regop may be changed if the node(s) contain certain sequences that
3112  * require special handling.  The joining is only done if:
3113  * 1) there is room in the current conglomerated node to entirely contain the
3114  *    next one.
3115  * 2) they are the exact same node type
3116  *
3117  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3118  * these get optimized out
3119  *
3120  * If a node is to match under /i (folded), the number of characters it matches
3121  * can be different than its character length if it contains a multi-character
3122  * fold.  *min_subtract is set to the total delta number of characters of the
3123  * input nodes.
3124  *
3125  * And *unfolded_multi_char is set to indicate whether or not the node contains
3126  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3127  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128  * SMALL LETTER SHARP S, as only if the target string being matched against
3129  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3131  * whose components are all above the Latin1 range are not run-time locale
3132  * dependent, and have already been folded by the time this function is
3133  * called.)
3134  *
3135  * This is as good a place as any to discuss the design of handling these
3136  * multi-character fold sequences.  It's been wrong in Perl for a very long
3137  * time.  There are three code points in Unicode whose multi-character folds
3138  * were long ago discovered to mess things up.  The previous designs for
3139  * dealing with these involved assigning a special node for them.  This
3140  * approach doesn't always work, as evidenced by this example:
3141  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3142  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3143  * would match just the \xDF, it won't be able to handle the case where a
3144  * successful match would have to cross the node's boundary.  The new approach
3145  * that hopefully generally solves the problem generates an EXACTFU_SS node
3146  * that is "sss" in this case.
3147  *
3148  * It turns out that there are problems with all multi-character folds, and not
3149  * just these three.  Now the code is general, for all such cases.  The
3150  * approach taken is:
3151  * 1)   This routine examines each EXACTFish node that could contain multi-
3152  *      character folded sequences.  Since a single character can fold into
3153  *      such a sequence, the minimum match length for this node is less than
3154  *      the number of characters in the node.  This routine returns in
3155  *      *min_subtract how many characters to subtract from the the actual
3156  *      length of the string to get a real minimum match length; it is 0 if
3157  *      there are no multi-char foldeds.  This delta is used by the caller to
3158  *      adjust the min length of the match, and the delta between min and max,
3159  *      so that the optimizer doesn't reject these possibilities based on size
3160  *      constraints.
3161  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3162  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3163  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3164  *      there is a possible fold length change.  That means that a regular
3165  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166  *      with length changes, and so can be processed faster.  regexec.c takes
3167  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3168  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169  *      known until runtime).  This saves effort in regex matching.  However,
3170  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3171  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3173  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3174  *      possibilities for the non-UTF8 patterns are quite simple, except for
3175  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3176  *      members of a fold-pair, and arrays are set up for all of them so that
3177  *      the other member of the pair can be found quickly.  Code elsewhere in
3178  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3180  *      described in the next item.
3181  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3182  *      validity of the fold won't be known until runtime, and so must remain
3183  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3184  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3185  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3186  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187  *      The reason this is a problem is that the optimizer part of regexec.c
3188  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189  *      that a character in the pattern corresponds to at most a single
3190  *      character in the target string.  (And I do mean character, and not byte
3191  *      here, unlike other parts of the documentation that have never been
3192  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3193  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3195  *      nodes, violate the assumption, and they are the only instances where it
3196  *      is violated.  I'm reluctant to try to change the assumption, as the
3197  *      code involved is impenetrable to me (khw), so instead the code here
3198  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3199  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200  *      boolean indicating whether or not the node contains such a fold.  When
3201  *      it is true, the caller sets a flag that later causes the optimizer in
3202  *      this file to not set values for the floating and fixed string lengths,
3203  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3204  *      assumption.  Thus, there is no optimization based on string lengths for
3205  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3207  *      assumption is wrong only in these cases is that all other non-UTF-8
3208  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3210  *      EXACTF nodes because we don't know at compile time if it actually
3211  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3212  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3213  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3214  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3215  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216  *      string would require the pattern to be forced into UTF-8, the overhead
3217  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3218  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219  *      locale.)
3220  *
3221  *      Similarly, the code that generates tries doesn't currently handle
3222  *      not-already-folded multi-char folds, and it looks like a pain to change
3223  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3224  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3225  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3226  *      using /iaa matching will be doing so almost entirely with ASCII
3227  *      strings, so this should rarely be encountered in practice */
3228
3229 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3230     if (PL_regkind[OP(scan)] == EXACT) \
3231         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3232
3233 STATIC U32
3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3235                    UV *min_subtract, bool *unfolded_multi_char,
3236                    U32 flags,regnode *val, U32 depth)
3237 {
3238     /* Merge several consecutive EXACTish nodes into one. */
3239     regnode *n = regnext(scan);
3240     U32 stringok = 1;
3241     regnode *next = scan + NODE_SZ_STR(scan);
3242     U32 merged = 0;
3243     U32 stopnow = 0;
3244 #ifdef DEBUGGING
3245     regnode *stop = scan;
3246     GET_RE_DEBUG_FLAGS_DECL;
3247 #else
3248     PERL_UNUSED_ARG(depth);
3249 #endif
3250
3251     PERL_ARGS_ASSERT_JOIN_EXACT;
3252 #ifndef EXPERIMENTAL_INPLACESCAN
3253     PERL_UNUSED_ARG(flags);
3254     PERL_UNUSED_ARG(val);
3255 #endif
3256     DEBUG_PEEP("join",scan,depth);
3257
3258     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3259      * EXACT ones that are mergeable to the current one. */
3260     while (n
3261            && (PL_regkind[OP(n)] == NOTHING
3262                || (stringok && OP(n) == OP(scan)))
3263            && NEXT_OFF(n)
3264            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265     {
3266
3267         if (OP(n) == TAIL || n > next)
3268             stringok = 0;
3269         if (PL_regkind[OP(n)] == NOTHING) {
3270             DEBUG_PEEP("skip:",n,depth);
3271             NEXT_OFF(scan) += NEXT_OFF(n);
3272             next = n + NODE_STEP_REGNODE;
3273 #ifdef DEBUGGING
3274             if (stringok)
3275                 stop = n;
3276 #endif
3277             n = regnext(n);
3278         }
3279         else if (stringok) {
3280             const unsigned int oldl = STR_LEN(scan);
3281             regnode * const nnext = regnext(n);
3282
3283             /* XXX I (khw) kind of doubt that this works on platforms (should
3284              * Perl ever run on one) where U8_MAX is above 255 because of lots
3285              * of other assumptions */
3286             /* Don't join if the sum can't fit into a single node */
3287             if (oldl + STR_LEN(n) > U8_MAX)
3288                 break;
3289
3290             DEBUG_PEEP("merg",n,depth);
3291             merged++;
3292
3293             NEXT_OFF(scan) += NEXT_OFF(n);
3294             STR_LEN(scan) += STR_LEN(n);
3295             next = n + NODE_SZ_STR(n);
3296             /* Now we can overwrite *n : */
3297             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298 #ifdef DEBUGGING
3299             stop = next - 1;
3300 #endif
3301             n = nnext;
3302             if (stopnow) break;
3303         }
3304
3305 #ifdef EXPERIMENTAL_INPLACESCAN
3306         if (flags && !NEXT_OFF(n)) {
3307             DEBUG_PEEP("atch", val, depth);
3308             if (reg_off_by_arg[OP(n)]) {
3309                 ARG_SET(n, val - n);
3310             }
3311             else {
3312                 NEXT_OFF(n) = val - n;
3313             }
3314             stopnow = 1;
3315         }
3316 #endif
3317     }
3318
3319     *min_subtract = 0;
3320     *unfolded_multi_char = FALSE;
3321
3322     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3323      * can now analyze for sequences of problematic code points.  (Prior to
3324      * this final joining, sequences could have been split over boundaries, and
3325      * hence missed).  The sequences only happen in folding, hence for any
3326      * non-EXACT EXACTish node */
3327     if (OP(scan) != EXACT) {
3328         U8* s0 = (U8*) STRING(scan);
3329         U8* s = s0;
3330         U8* s_end = s0 + STR_LEN(scan);
3331
3332         int total_count_delta = 0;  /* Total delta number of characters that
3333                                        multi-char folds expand to */
3334
3335         /* One pass is made over the node's string looking for all the
3336          * possibilities.  To avoid some tests in the loop, there are two main
3337          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338          * non-UTF-8 */
3339         if (UTF) {
3340             U8* folded = NULL;
3341
3342             if (OP(scan) == EXACTFL) {
3343                 U8 *d;
3344
3345                 /* An EXACTFL node would already have been changed to another
3346                  * node type unless there is at least one character in it that
3347                  * is problematic; likely a character whose fold definition
3348                  * won't be known until runtime, and so has yet to be folded.
3349                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3350                  * to handle the UTF-8 case, we need to create a temporary
3351                  * folded copy using UTF-8 locale rules in order to analyze it.
3352                  * This is because our macros that look to see if a sequence is
3353                  * a multi-char fold assume everything is folded (otherwise the
3354                  * tests in those macros would be too complicated and slow).
3355                  * Note that here, the non-problematic folds will have already
3356                  * been done, so we can just copy such characters.  We actually
3357                  * don't completely fold the EXACTFL string.  We skip the
3358                  * unfolded multi-char folds, as that would just create work
3359                  * below to figure out the size they already are */
3360
3361                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362                 d = folded;
3363                 while (s < s_end) {
3364                     STRLEN s_len = UTF8SKIP(s);
3365                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366                         Copy(s, d, s_len, U8);
3367                         d += s_len;
3368                     }
3369                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3370                         *unfolded_multi_char = TRUE;
3371                         Copy(s, d, s_len, U8);
3372                         d += s_len;
3373                     }
3374                     else if (isASCII(*s)) {
3375                         *(d++) = toFOLD(*s);
3376                     }
3377                     else {
3378                         STRLEN len;
3379                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380                         d += len;
3381                     }
3382                     s += s_len;
3383                 }
3384
3385                 /* Point the remainder of the routine to look at our temporary
3386                  * folded copy */
3387                 s = folded;
3388                 s_end = d;
3389             } /* End of creating folded copy of EXACTFL string */
3390
3391             /* Examine the string for a multi-character fold sequence.  UTF-8
3392              * patterns have all characters pre-folded by the time this code is
3393              * executed */
3394             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395                                      length sequence we are looking for is 2 */
3396             {
3397                 int count = 0;  /* How many characters in a multi-char fold */
3398                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3399                 if (! len) {    /* Not a multi-char fold: get next char */
3400                     s += UTF8SKIP(s);
3401                     continue;
3402                 }
3403
3404                 /* Nodes with 'ss' require special handling, except for
3405                  * EXACTFA-ish for which there is no multi-char fold to this */
3406                 if (len == 2 && *s == 's' && *(s+1) == 's'
3407                     && OP(scan) != EXACTFA
3408                     && OP(scan) != EXACTFA_NO_TRIE)
3409                 {
3410                     count = 2;
3411                     if (OP(scan) != EXACTFL) {
3412                         OP(scan) = EXACTFU_SS;
3413                     }
3414                     s += 2;
3415                 }
3416                 else { /* Here is a generic multi-char fold. */
3417                     U8* multi_end  = s + len;
3418
3419                     /* Count how many characters in it.  In the case of /aa, no
3420                      * folds which contain ASCII code points are allowed, so
3421                      * check for those, and skip if found. */
3422                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3423                         count = utf8_length(s, multi_end);
3424                         s = multi_end;
3425                     }
3426                     else {
3427                         while (s < multi_end) {
3428                             if (isASCII(*s)) {
3429                                 s++;
3430                                 goto next_iteration;
3431                             }
3432                             else {
3433                                 s += UTF8SKIP(s);
3434                             }
3435                             count++;
3436                         }
3437                     }
3438                 }
3439
3440                 /* The delta is how long the sequence is minus 1 (1 is how long
3441                  * the character that folds to the sequence is) */
3442                 total_count_delta += count - 1;
3443               next_iteration: ;
3444             }
3445
3446             /* We created a temporary folded copy of the string in EXACTFL
3447              * nodes.  Therefore we need to be sure it doesn't go below zero,
3448              * as the real string could be shorter */
3449             if (OP(scan) == EXACTFL) {
3450                 int total_chars = utf8_length((U8*) STRING(scan),
3451                                            (U8*) STRING(scan) + STR_LEN(scan));
3452                 if (total_count_delta > total_chars) {
3453                     total_count_delta = total_chars;
3454                 }
3455             }
3456
3457             *min_subtract += total_count_delta;
3458             Safefree(folded);
3459         }
3460         else if (OP(scan) == EXACTFA) {
3461
3462             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3463              * fold to the ASCII range (and there are no existing ones in the
3464              * upper latin1 range).  But, as outlined in the comments preceding
3465              * this function, we need to flag any occurrences of the sharp s.
3466              * This character forbids trie formation (because of added
3467              * complexity) */
3468             while (s < s_end) {
3469                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3470                     OP(scan) = EXACTFA_NO_TRIE;
3471                     *unfolded_multi_char = TRUE;
3472                     break;
3473                 }
3474                 s++;
3475                 continue;
3476             }
3477         }
3478         else {
3479
3480             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3481              * folds that are all Latin1.  As explained in the comments
3482              * preceding this function, we look also for the sharp s in EXACTF
3483              * and EXACTFL nodes; it can be in the final position.  Otherwise
3484              * we can stop looking 1 byte earlier because have to find at least
3485              * two characters for a multi-fold */
3486             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3487                               ? s_end
3488                               : s_end -1;
3489
3490             while (s < upper) {
3491                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3492                 if (! len) {    /* Not a multi-char fold. */
3493                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3494                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3495                     {
3496                         *unfolded_multi_char = TRUE;
3497                     }
3498                     s++;
3499                     continue;
3500                 }
3501
3502                 if (len == 2
3503                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3504                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3505                 {
3506
3507                     /* EXACTF nodes need to know that the minimum length
3508                      * changed so that a sharp s in the string can match this
3509                      * ss in the pattern, but they remain EXACTF nodes, as they
3510                      * won't match this unless the target string is is UTF-8,
3511                      * which we don't know until runtime.  EXACTFL nodes can't
3512                      * transform into EXACTFU nodes */
3513                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3514                         OP(scan) = EXACTFU_SS;
3515                     }
3516                 }
3517
3518                 *min_subtract += len - 1;
3519                 s += len;
3520             }
3521         }
3522     }
3523
3524 #ifdef DEBUGGING
3525     /* Allow dumping but overwriting the collection of skipped
3526      * ops and/or strings with fake optimized ops */
3527     n = scan + NODE_SZ_STR(scan);
3528     while (n <= stop) {
3529         OP(n) = OPTIMIZED;
3530         FLAGS(n) = 0;
3531         NEXT_OFF(n) = 0;
3532         n++;
3533     }
3534 #endif
3535     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3536     return stopnow;
3537 }
3538
3539 /* REx optimizer.  Converts nodes into quicker variants "in place".
3540    Finds fixed substrings.  */
3541
3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3543    to the position after last scanned or to NULL. */
3544
3545 #define INIT_AND_WITHP \
3546     assert(!and_withp); \
3547     Newx(and_withp,1, regnode_ssc); \
3548     SAVEFREEPV(and_withp)
3549
3550 /* this is a chain of data about sub patterns we are processing that
3551    need to be handled separately/specially in study_chunk. Its so
3552    we can simulate recursion without losing state.  */
3553 struct scan_frame;
3554 typedef struct scan_frame {
3555     regnode *last;  /* last node to process in this frame */
3556     regnode *next;  /* next node to process when last is reached */
3557     struct scan_frame *prev; /*previous frame*/
3558     U32 prev_recursed_depth;
3559     I32 stop; /* what stopparen do we use */
3560 } scan_frame;
3561
3562
3563 STATIC SSize_t
3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3565                         SSize_t *minlenp, SSize_t *deltap,
3566                         regnode *last,
3567                         scan_data_t *data,
3568                         I32 stopparen,
3569                         U32 recursed_depth,
3570                         regnode_ssc *and_withp,
3571                         U32 flags, U32 depth)
3572                         /* scanp: Start here (read-write). */
3573                         /* deltap: Write maxlen-minlen here. */
3574                         /* last: Stop before this one. */
3575                         /* data: string data about the pattern */
3576                         /* stopparen: treat close N as END */
3577                         /* recursed: which subroutines have we recursed into */
3578                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3579 {
3580     dVAR;
3581     /* There must be at least this number of characters to match */
3582     SSize_t min = 0;
3583     I32 pars = 0, code;
3584     regnode *scan = *scanp, *next;
3585     SSize_t delta = 0;
3586     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3587     int is_inf_internal = 0;            /* The studied chunk is infinite */
3588     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3589     scan_data_t data_fake;
3590     SV *re_trie_maxbuff = NULL;
3591     regnode *first_non_open = scan;
3592     SSize_t stopmin = SSize_t_MAX;
3593     scan_frame *frame = NULL;
3594     GET_RE_DEBUG_FLAGS_DECL;
3595
3596     PERL_ARGS_ASSERT_STUDY_CHUNK;
3597
3598 #ifdef DEBUGGING
3599     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3600 #endif
3601     if ( depth == 0 ) {
3602         while (first_non_open && OP(first_non_open) == OPEN)
3603             first_non_open=regnext(first_non_open);
3604     }
3605
3606
3607   fake_study_recurse:
3608     while ( scan && OP(scan) != END && scan < last ){
3609         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3610                                    node length to get a real minimum (because
3611                                    the folded version may be shorter) */
3612         bool unfolded_multi_char = FALSE;
3613         /* Peephole optimizer: */
3614         DEBUG_OPTIMISE_MORE_r(
3615         {
3616             PerlIO_printf(Perl_debug_log,
3617                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3618                 ((int) depth*2), "", (long)stopparen,
3619                 (unsigned long)depth, (unsigned long)recursed_depth);
3620             if (recursed_depth) {
3621                 U32 i;
3622                 U32 j;
3623                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3624                     PerlIO_printf(Perl_debug_log,"[");
3625                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3626                         PerlIO_printf(Perl_debug_log,"%d",
3627                             PAREN_TEST(RExC_study_chunk_recursed +
3628                                        (j * RExC_study_chunk_recursed_bytes), i)
3629                             ? 1 : 0
3630                         );
3631                     PerlIO_printf(Perl_debug_log,"]");
3632                 }
3633             }
3634             PerlIO_printf(Perl_debug_log,"\n");
3635         }
3636         );
3637         DEBUG_STUDYDATA("Peep:", data, depth);
3638         DEBUG_PEEP("Peep", scan, depth);
3639
3640
3641         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3642          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3643          * by a different invocation of reg() -- Yves
3644          */
3645         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3646
3647         /* Follow the next-chain of the current node and optimize
3648            away all the NOTHINGs from it.  */
3649         if (OP(scan) != CURLYX) {
3650             const int max = (reg_off_by_arg[OP(scan)]
3651                        ? I32_MAX
3652                        /* I32 may be smaller than U16 on CRAYs! */
3653                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3654             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3655             int noff;
3656             regnode *n = scan;
3657
3658             /* Skip NOTHING and LONGJMP. */
3659             while ((n = regnext(n))
3660                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3661                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3662                    && off + noff < max)
3663                 off += noff;
3664             if (reg_off_by_arg[OP(scan)])
3665                 ARG(scan) = off;
3666             else
3667                 NEXT_OFF(scan) = off;
3668         }
3669
3670
3671
3672         /* The principal pseudo-switch.  Cannot be a switch, since we
3673            look into several different things.  */
3674         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3675                    || OP(scan) == IFTHEN) {
3676             next = regnext(scan);
3677             code = OP(scan);
3678             /* demq: the op(next)==code check is to see if we have
3679              * "branch-branch" AFAICT */
3680
3681             if (OP(next) == code || code == IFTHEN) {
3682                 /* NOTE - There is similar code to this block below for
3683                  * handling TRIE nodes on a re-study.  If you change stuff here
3684                  * check there too. */
3685                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3686                 regnode_ssc accum;
3687                 regnode * const startbranch=scan;
3688
3689                 if (flags & SCF_DO_SUBSTR) {
3690                     /* Cannot merge strings after this. */
3691                     scan_commit(pRExC_state, data, minlenp, is_inf);
3692                 }
3693
3694                 if (flags & SCF_DO_STCLASS)
3695                     ssc_init_zero(pRExC_state, &accum);
3696
3697                 while (OP(scan) == code) {
3698                     SSize_t deltanext, minnext, fake;
3699                     I32 f = 0;
3700                     regnode_ssc this_class;
3701
3702                     num++;
3703                     data_fake.flags = 0;
3704                     if (data) {
3705                         data_fake.whilem_c = data->whilem_c;
3706                         data_fake.last_closep = data->last_closep;
3707                     }
3708                     else
3709                         data_fake.last_closep = &fake;
3710
3711                     data_fake.pos_delta = delta;
3712                     next = regnext(scan);
3713                     scan = NEXTOPER(scan);
3714                     if (code != BRANCH)
3715                         scan = NEXTOPER(scan);
3716                     if (flags & SCF_DO_STCLASS) {
3717                         ssc_init(pRExC_state, &this_class);
3718                         data_fake.start_class = &this_class;
3719                         f = SCF_DO_STCLASS_AND;
3720                     }
3721                     if (flags & SCF_WHILEM_VISITED_POS)
3722                         f |= SCF_WHILEM_VISITED_POS;
3723
3724                     /* we suppose the run is continuous, last=next...*/
3725                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3726                                       &deltanext, next, &data_fake, stopparen,
3727                                       recursed_depth, NULL, f,depth+1);
3728                     if (min1 > minnext)
3729                         min1 = minnext;
3730                     if (deltanext == SSize_t_MAX) {
3731                         is_inf = is_inf_internal = 1;
3732                         max1 = SSize_t_MAX;
3733                     } else if (max1 < minnext + deltanext)
3734                         max1 = minnext + deltanext;
3735                     scan = next;
3736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737                         pars++;
3738                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3739                         if ( stopmin > minnext)
3740                             stopmin = min + min1;
3741                         flags &= ~SCF_DO_SUBSTR;
3742                         if (data)
3743                             data->flags |= SCF_SEEN_ACCEPT;
3744                     }
3745                     if (data) {
3746                         if (data_fake.flags & SF_HAS_EVAL)
3747                             data->flags |= SF_HAS_EVAL;
3748                         data->whilem_c = data_fake.whilem_c;
3749                     }
3750                     if (flags & SCF_DO_STCLASS)
3751                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3752                 }
3753                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3754                     min1 = 0;
3755                 if (flags & SCF_DO_SUBSTR) {
3756                     data->pos_min += min1;
3757                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3758                         data->pos_delta = SSize_t_MAX;
3759                     else
3760                         data->pos_delta += max1 - min1;
3761                     if (max1 != min1 || is_inf)
3762                         data->longest = &(data->longest_float);
3763                 }
3764                 min += min1;
3765                 if (delta == SSize_t_MAX
3766                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3767                     delta = SSize_t_MAX;
3768                 else
3769                     delta += max1 - min1;
3770                 if (flags & SCF_DO_STCLASS_OR) {
3771                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3772                     if (min1) {
3773                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3774                         flags &= ~SCF_DO_STCLASS;
3775                     }
3776                 }
3777                 else if (flags & SCF_DO_STCLASS_AND) {
3778                     if (min1) {
3779                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3780                         flags &= ~SCF_DO_STCLASS;
3781                     }
3782                     else {
3783                         /* Switch to OR mode: cache the old value of
3784                          * data->start_class */
3785                         INIT_AND_WITHP;
3786                         StructCopy(data->start_class, and_withp, regnode_ssc);
3787                         flags &= ~SCF_DO_STCLASS_AND;
3788                         StructCopy(&accum, data->start_class, regnode_ssc);
3789                         flags |= SCF_DO_STCLASS_OR;
3790                     }
3791                 }
3792
3793                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3794                         OP( startbranch ) == BRANCH )
3795                 {
3796                 /* demq.
3797
3798                    Assuming this was/is a branch we are dealing with: 'scan'
3799                    now points at the item that follows the branch sequence,
3800                    whatever it is. We now start at the beginning of the
3801                    sequence and look for subsequences of
3802
3803                    BRANCH->EXACT=>x1
3804                    BRANCH->EXACT=>x2
3805                    tail
3806
3807                    which would be constructed from a pattern like
3808                    /A|LIST|OF|WORDS/
3809
3810                    If we can find such a subsequence we need to turn the first
3811                    element into a trie and then add the subsequent branch exact
3812                    strings to the trie.
3813
3814                    We have two cases
3815
3816                      1. patterns where the whole set of branches can be
3817                         converted.
3818
3819                      2. patterns where only a subset can be converted.
3820
3821                    In case 1 we can replace the whole set with a single regop
3822                    for the trie. In case 2 we need to keep the start and end
3823                    branches so
3824
3825                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3826                      becomes BRANCH TRIE; BRANCH X;
3827
3828                   There is an additional case, that being where there is a
3829                   common prefix, which gets split out into an EXACT like node
3830                   preceding the TRIE node.
3831
3832                   If x(1..n)==tail then we can do a simple trie, if not we make
3833                   a "jump" trie, such that when we match the appropriate word
3834                   we "jump" to the appropriate tail node. Essentially we turn
3835                   a nested if into a case structure of sorts.
3836
3837                 */
3838
3839                     int made=0;
3840                     if (!re_trie_maxbuff) {
3841                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3842                         if (!SvIOK(re_trie_maxbuff))
3843                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3844                     }
3845                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3846                         regnode *cur;
3847                         regnode *first = (regnode *)NULL;
3848                         regnode *last = (regnode *)NULL;
3849                         regnode *tail = scan;
3850                         U8 trietype = 0;
3851                         U32 count=0;
3852
3853 #ifdef DEBUGGING
3854                         SV * const mysv = sv_newmortal();   /* for dumping */
3855 #endif
3856                         /* var tail is used because there may be a TAIL
3857                            regop in the way. Ie, the exacts will point to the
3858                            thing following the TAIL, but the last branch will
3859                            point at the TAIL. So we advance tail. If we
3860                            have nested (?:) we may have to move through several
3861                            tails.
3862                          */
3863
3864                         while ( OP( tail ) == TAIL ) {
3865                             /* this is the TAIL generated by (?:) */
3866                             tail = regnext( tail );
3867                         }
3868
3869
3870                         DEBUG_TRIE_COMPILE_r({
3871                             regprop(RExC_rx, mysv, tail, NULL);
3872                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3873                               (int)depth * 2 + 2, "",
3874                               "Looking for TRIE'able sequences. Tail node is: ",
3875                               SvPV_nolen_const( mysv )
3876                             );
3877                         });
3878
3879                         /*
3880
3881                             Step through the branches
3882                                 cur represents each branch,
3883                                 noper is the first thing to be matched as part
3884                                       of that branch
3885                                 noper_next is the regnext() of that node.
3886
3887                             We normally handle a case like this
3888                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3889                             support building with NOJUMPTRIE, which restricts
3890                             the trie logic to structures like /FOO|BAR/.
3891
3892                             If noper is a trieable nodetype then the branch is
3893                             a possible optimization target. If we are building
3894                             under NOJUMPTRIE then we require that noper_next is
3895                             the same as scan (our current position in the regex
3896                             program).
3897
3898                             Once we have two or more consecutive such branches
3899                             we can create a trie of the EXACT's contents and
3900                             stitch it in place into the program.
3901
3902                             If the sequence represents all of the branches in
3903                             the alternation we replace the entire thing with a
3904                             single TRIE node.
3905
3906                             Otherwise when it is a subsequence we need to
3907                             stitch it in place and replace only the relevant
3908                             branches. This means the first branch has to remain
3909                             as it is used by the alternation logic, and its
3910                             next pointer, and needs to be repointed at the item
3911                             on the branch chain following the last branch we
3912                             have optimized away.
3913
3914                             This could be either a BRANCH, in which case the
3915                             subsequence is internal, or it could be the item
3916                             following the branch sequence in which case the
3917                             subsequence is at the end (which does not
3918                             necessarily mean the first node is the start of the
3919                             alternation).
3920
3921                             TRIE_TYPE(X) is a define which maps the optype to a
3922                             trietype.
3923
3924                                 optype          |  trietype
3925                                 ----------------+-----------
3926                                 NOTHING         | NOTHING
3927                                 EXACT           | EXACT
3928                                 EXACTFU         | EXACTFU
3929                                 EXACTFU_SS      | EXACTFU
3930                                 EXACTFA         | EXACTFA
3931
3932
3933                         */
3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3935                        ( EXACT == (X) )   ? EXACT :        \
3936                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3937                        ( EXACTFA == (X) ) ? EXACTFA :        \
3938                        0 )
3939
3940                         /* dont use tail as the end marker for this traverse */
3941                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3942                             regnode * const noper = NEXTOPER( cur );
3943                             U8 noper_type = OP( noper );
3944                             U8 noper_trietype = TRIE_TYPE( noper_type );
3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3946                             regnode * const noper_next = regnext( noper );
3947                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3948                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3949 #endif
3950
3951                             DEBUG_TRIE_COMPILE_r({
3952                                 regprop(RExC_rx, mysv, cur, NULL);
3953                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3954                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3955
3956                                 regprop(RExC_rx, mysv, noper, NULL);
3957                                 PerlIO_printf( Perl_debug_log, " -> %s",
3958                                     SvPV_nolen_const(mysv));
3959
3960                                 if ( noper_next ) {
3961                                   regprop(RExC_rx, mysv, noper_next, NULL);
3962                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3963                                     SvPV_nolen_const(mysv));
3964                                 }
3965                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3966                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3967                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3968                                 );
3969                             });
3970
3971                             /* Is noper a trieable nodetype that can be merged
3972                              * with the current trie (if there is one)? */
3973                             if ( noper_trietype
3974                                   &&
3975                                   (
3976                                         ( noper_trietype == NOTHING)
3977                                         || ( trietype == NOTHING )
3978                                         || ( trietype == noper_trietype )
3979                                   )
3980 #ifdef NOJUMPTRIE
3981                                   && noper_next == tail
3982 #endif
3983                                   && count < U16_MAX)
3984                             {
3985                                 /* Handle mergable triable node Either we are
3986                                  * the first node in a new trieable sequence,
3987                                  * in which case we do some bookkeeping,
3988                                  * otherwise we update the end pointer. */
3989                                 if ( !first ) {
3990                                     first = cur;
3991                                     if ( noper_trietype == NOTHING ) {
3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3993                                         regnode * const noper_next = regnext( noper );
3994                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3995                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3996 #endif
3997
3998                                         if ( noper_next_trietype ) {
3999                                             trietype = noper_next_trietype;
4000                                         } else if (noper_next_type)  {
4001                                             /* a NOTHING regop is 1 regop wide.
4002                                              * We need at least two for a trie
4003                                              * so we can't merge this in */
4004                                             first = NULL;
4005                                         }
4006                                     } else {
4007                                         trietype = noper_trietype;
4008                                     }
4009                                 } else {
4010                                     if ( trietype == NOTHING )
4011                                         trietype = noper_trietype;
4012                                     last = cur;
4013                                 }
4014                                 if (first)
4015                                     count++;
4016                             } /* end handle mergable triable node */
4017                             else {
4018                                 /* handle unmergable node -
4019                                  * noper may either be a triable node which can
4020                                  * not be tried together with the current trie,
4021                                  * or a non triable node */
4022                                 if ( last ) {
4023                                     /* If last is set and trietype is not
4024                                      * NOTHING then we have found at least two
4025                                      * triable branch sequences in a row of a
4026                                      * similar trietype so we can turn them
4027                                      * into a trie. If/when we allow NOTHING to
4028                                      * start a trie sequence this condition
4029                                      * will be required, and it isn't expensive
4030                                      * so we leave it in for now. */
4031                                     if ( trietype && trietype != NOTHING )
4032                                         make_trie( pRExC_state,
4033                                                 startbranch, first, cur, tail,
4034                                                 count, trietype, depth+1 );
4035                                     last = NULL; /* note: we clear/update
4036                                                     first, trietype etc below,
4037                                                     so we dont do it here */
4038                                 }
4039                                 if ( noper_trietype
4040 #ifdef NOJUMPTRIE
4041                                      && noper_next == tail
4042 #endif
4043                                 ){
4044                                     /* noper is triable, so we can start a new
4045                                      * trie sequence */
4046                                     count = 1;
4047                                     first = cur;
4048                                     trietype = noper_trietype;
4049                                 } else if (first) {
4050                                     /* if we already saw a first but the
4051                                      * current node is not triable then we have
4052                                      * to reset the first information. */
4053                                     count = 0;
4054                                     first = NULL;
4055                                     trietype = 0;
4056                                 }
4057                             } /* end handle unmergable node */
4058                         } /* loop over branches */
4059                         DEBUG_TRIE_COMPILE_r({
4060                             regprop(RExC_rx, mysv, cur, NULL);
4061                             PerlIO_printf( Perl_debug_log,
4062                               "%*s- %s (%d) <SCAN FINISHED>\n",
4063                               (int)depth * 2 + 2,
4064                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4065
4066                         });
4067                         if ( last && trietype ) {
4068                             if ( trietype != NOTHING ) {
4069                                 /* the last branch of the sequence was part of
4070                                  * a trie, so we have to construct it here
4071                                  * outside of the loop */
4072                                 made= make_trie( pRExC_state, startbranch,
4073                                                  first, scan, tail, count,
4074                                                  trietype, depth+1 );
4075 #ifdef TRIE_STUDY_OPT
4076                                 if ( ((made == MADE_EXACT_TRIE &&
4077                                      startbranch == first)
4078                                      || ( first_non_open == first )) &&
4079                                      depth==0 ) {
4080                                     flags |= SCF_TRIE_RESTUDY;
4081                                     if ( startbranch == first
4082                                          && scan == tail )
4083                                     {
4084                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4085                                     }
4086                                 }
4087 #endif
4088                             } else {
4089                                 /* at this point we know whatever we have is a
4090                                  * NOTHING sequence/branch AND if 'startbranch'
4091                                  * is 'first' then we can turn the whole thing
4092                                  * into a NOTHING
4093                                  */
4094                                 if ( startbranch == first ) {
4095                                     regnode *opt;
4096                                     /* the entire thing is a NOTHING sequence,
4097                                      * something like this: (?:|) So we can
4098                                      * turn it into a plain NOTHING op. */
4099                                     DEBUG_TRIE_COMPILE_r({
4100                                         regprop(RExC_rx, mysv, cur, NULL);
4101                                         PerlIO_printf( Perl_debug_log,
4102                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4103                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4104
4105                                     });
4106                                     OP(startbranch)= NOTHING;
4107                                     NEXT_OFF(startbranch)= tail - startbranch;
4108                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4109                                         OP(opt)= OPTIMIZED;
4110                                 }
4111                             }
4112                         } /* end if ( last) */
4113                     } /* TRIE_MAXBUF is non zero */
4114
4115                 } /* do trie */
4116
4117             }
4118             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4119                 scan = NEXTOPER(NEXTOPER(scan));
4120             } else                      /* single branch is optimized. */
4121                 scan = NEXTOPER(scan);
4122             continue;
4123         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4124             scan_frame *newframe = NULL;
4125             I32 paren;
4126             regnode *start;
4127             regnode *end;
4128             U32 my_recursed_depth= recursed_depth;
4129
4130             if (OP(scan) != SUSPEND) {
4131                 /* set the pointer */
4132                 if (OP(scan) == GOSUB) {
4133                     paren = ARG(scan);
4134                     RExC_recurse[ARG2L(scan)] = scan;
4135                     start = RExC_open_parens[paren-1];
4136                     end   = RExC_close_parens[paren-1];
4137                 } else {
4138                     paren = 0;
4139                     start = RExC_rxi->program + 1;
4140                     end   = RExC_opend;
4141                 }
4142                 if (!recursed_depth
4143                     ||
4144                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4145                 ) {
4146                     if (!recursed_depth) {
4147                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4148                     } else {
4149                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4150                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed_bytes, U8);
4152                     }
4153                     /* we havent recursed into this paren yet, so recurse into it */
4154                     DEBUG_STUDYDATA("set:", data,depth);
4155                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4156                     my_recursed_depth= recursed_depth + 1;
4157                     Newx(newframe,1,scan_frame);
4158                 } else {
4159                     DEBUG_STUDYDATA("inf:", data,depth);
4160                     /* some form of infinite recursion, assume infinite length
4161                      * */
4162                     if (flags & SCF_DO_SUBSTR) {
4163                         scan_commit(pRExC_state, data, minlenp, is_inf);
4164                         data->longest = &(data->longest_float);
4165                     }
4166                     is_inf = is_inf_internal = 1;
4167                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4168                         ssc_anything(data->start_class);
4169                     flags &= ~SCF_DO_STCLASS;
4170                 }
4171             } else {
4172                 Newx(newframe,1,scan_frame);
4173                 paren = stopparen;
4174                 start = scan+2;
4175                 end = regnext(scan);
4176             }
4177             if (newframe) {
4178                 assert(start);
4179                 assert(end);
4180                 SAVEFREEPV(newframe);
4181                 newframe->next = regnext(scan);
4182                 newframe->last = last;
4183                 newframe->stop = stopparen;
4184                 newframe->prev = frame;
4185                 newframe->prev_recursed_depth = recursed_depth;
4186
4187                 DEBUG_STUDYDATA("frame-new:",data,depth);
4188                 DEBUG_PEEP("fnew", scan, depth);
4189
4190                 frame = newframe;
4191                 scan =  start;
4192                 stopparen = paren;
4193                 last = end;
4194                 depth = depth + 1;
4195                 recursed_depth= my_recursed_depth;
4196
4197                 continue;
4198             }
4199         }
4200         else if (OP(scan) == EXACT) {
4201             SSize_t l = STR_LEN(scan);
4202             UV uc;
4203             if (UTF) {
4204                 const U8 * const s = (U8*)STRING(scan);
4205                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4206                 l = utf8_length(s, s + l);
4207             } else {
4208                 uc = *((U8*)STRING(scan));
4209             }
4210             min += l;
4211             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4212                 /* The code below prefers earlier match for fixed
4213                    offset, later match for variable offset.  */
4214                 if (data->last_end == -1) { /* Update the start info. */
4215                     data->last_start_min = data->pos_min;
4216                     data->last_start_max = is_inf
4217                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4218                 }
4219                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4220                 if (UTF)
4221                     SvUTF8_on(data->last_found);
4222                 {
4223                     SV * const sv = data->last_found;
4224                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4225                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4226                     if (mg && mg->mg_len >= 0)
4227                         mg->mg_len += utf8_length((U8*)STRING(scan),
4228                                               (U8*)STRING(scan)+STR_LEN(scan));
4229                 }
4230                 data->last_end = data->pos_min + l;
4231                 data->pos_min += l; /* As in the first entry. */
4232                 data->flags &= ~SF_BEFORE_EOL;
4233             }
4234
4235             /* ANDing the code point leaves at most it, and not in locale, and
4236              * can't match null string */
4237             if (flags & SCF_DO_STCLASS_AND) {
4238                 ssc_cp_and(data->start_class, uc);
4239                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4240                 ssc_clear_locale(data->start_class);
4241             }
4242             else if (flags & SCF_DO_STCLASS_OR) {
4243                 ssc_add_cp(data->start_class, uc);
4244                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4245
4246                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4247                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4248             }
4249             flags &= ~SCF_DO_STCLASS;
4250         }
4251         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4252             SSize_t l = STR_LEN(scan);
4253             UV uc = *((U8*)STRING(scan));
4254             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4255                                                      separate code points */
4256
4257             /* Search for fixed substrings supports EXACT only. */
4258             if (flags & SCF_DO_SUBSTR) {
4259                 assert(data);
4260                 scan_commit(pRExC_state, data, minlenp, is_inf);
4261             }
4262             if (UTF) {
4263                 const U8 * const s = (U8 *)STRING(scan);
4264                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4265                 l = utf8_length(s, s + l);
4266             }
4267             if (unfolded_multi_char) {
4268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4269             }
4270             min += l - min_subtract;
4271             assert (min >= 0);
4272             delta += min_subtract;
4273             if (flags & SCF_DO_SUBSTR) {
4274                 data->pos_min += l - min_subtract;
4275                 if (data->pos_min < 0) {
4276                     data->pos_min = 0;
4277                 }
4278                 data->pos_delta += min_subtract;
4279                 if (min_subtract) {
4280                     data->longest = &(data->longest_float);
4281                 }
4282             }
4283             if (OP(scan) == EXACTFL) {
4284
4285                 /* We don't know what the folds are; it could be anything. XXX
4286                  * Actually, we only support UTF-8 encoding for code points
4287                  * above Latin1, so we could know what those folds are. */
4288                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4289                                                        0,
4290                                                        UV_MAX);
4291             }
4292             else {  /* Non-locale EXACTFish */
4293                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4294                 if (flags & SCF_DO_STCLASS_AND) {
4295                     ssc_clear_locale(data->start_class);
4296                 }
4297                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4298                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4299                                                        know if anything folds
4300                                                        with this */
4301                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4302                                                            PL_fold_latin1[uc]);
4303                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4304                                                       legal under /iaa */
4305                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4306                                 EXACTF_invlist
4307                                     = add_cp_to_invlist(EXACTF_invlist,
4308                                                 LATIN_SMALL_LETTER_SHARP_S);
4309                             }
4310                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4311                                 EXACTF_invlist
4312                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4313                                 EXACTF_invlist
4314                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4315                             }
4316                         }
4317
4318                         /* We also know if there are above-Latin1 code points
4319                          * that fold to this (none legal for ASCII and /iaa) */
4320                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4321                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4322                         {
4323                             /* XXX We could know exactly what does fold to this
4324                              * if the reverse folds are loaded, as currently in
4325                              * S_regclass() */
4326                             _invlist_union(EXACTF_invlist,
4327                                            PL_AboveLatin1,
4328                                            &EXACTF_invlist);
4329                         }
4330                     }
4331                 }
4332                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4333                            know what participates in folds with this, so have
4334                            to assume anything could */
4335
4336                     /* XXX We could know exactly what does fold to this if the
4337                      * reverse folds are loaded, as currently in S_regclass().
4338                      * But we do know that under /iaa nothing in the ASCII
4339                      * range can participate */
4340                     if (OP(scan) == EXACTFA) {
4341                         _invlist_union_complement_2nd(EXACTF_invlist,
4342                                                       PL_XPosix_ptrs[_CC_ASCII],
4343                                                       &EXACTF_invlist);
4344                     }
4345                     else {
4346                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4347                                                                0, UV_MAX);
4348                     }
4349                 }
4350             }
4351             if (flags & SCF_DO_STCLASS_AND) {
4352                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4353                 ANYOF_POSIXL_ZERO(data->start_class);
4354                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4355             }
4356             else if (flags & SCF_DO_STCLASS_OR) {
4357                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4358                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4359
4360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4361                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4362             }
4363             flags &= ~SCF_DO_STCLASS;
4364             SvREFCNT_dec(EXACTF_invlist);
4365         }
4366         else if (REGNODE_VARIES(OP(scan))) {
4367             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4368             I32 fl = 0, f = flags;
4369             regnode * const oscan = scan;
4370             regnode_ssc this_class;
4371             regnode_ssc *oclass = NULL;
4372             I32 next_is_eval = 0;
4373
4374             switch (PL_regkind[OP(scan)]) {
4375             case WHILEM:                /* End of (?:...)* . */
4376                 scan = NEXTOPER(scan);
4377                 goto finish;
4378             case PLUS:
4379                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4380                     next = NEXTOPER(scan);
4381                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4382                         mincount = 1;
4383                         maxcount = REG_INFTY;
4384                         next = regnext(scan);
4385                         scan = NEXTOPER(scan);
4386                         goto do_curly;
4387                     }
4388                 }
4389                 if (flags & SCF_DO_SUBSTR)
4390                     data->pos_min++;
4391                 min++;
4392                 /* Fall through. */
4393             case STAR:
4394                 if (flags & SCF_DO_STCLASS) {
4395                     mincount = 0;
4396                     maxcount = REG_INFTY;
4397                     next = regnext(scan);
4398                     scan = NEXTOPER(scan);
4399                     goto do_curly;
4400                 }
4401                 if (flags & SCF_DO_SUBSTR) {
4402                     scan_commit(pRExC_state, data, minlenp, is_inf);
4403                     /* Cannot extend fixed substrings */
4404                     data->longest = &(data->longest_float);
4405                 }
4406                 is_inf = is_inf_internal = 1;
4407                 scan = regnext(scan);
4408                 goto optimize_curly_tail;
4409             case CURLY:
4410                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4411                     && (scan->flags == stopparen))
4412                 {
4413                     mincount = 1;
4414                     maxcount = 1;
4415                 } else {
4416                     mincount = ARG1(scan);
4417                     maxcount = ARG2(scan);
4418                 }
4419                 next = regnext(scan);
4420                 if (OP(scan) == CURLYX) {
4421                     I32 lp = (data ? *(data->last_closep) : 0);
4422                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4423                 }
4424                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4425                 next_is_eval = (OP(scan) == EVAL);
4426               do_curly:
4427                 if (flags & SCF_DO_SUBSTR) {
4428                     if (mincount == 0)
4429                         scan_commit(pRExC_state, data, minlenp, is_inf);
4430                     /* Cannot extend fixed substrings */
4431                     pos_before = data->pos_min;
4432                 }
4433                 if (data) {
4434                     fl = data->flags;
4435                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4436                     if (is_inf)
4437                         data->flags |= SF_IS_INF;
4438                 }
4439                 if (flags & SCF_DO_STCLASS) {
4440                     ssc_init(pRExC_state, &this_class);
4441                     oclass = data->start_class;
4442                     data->start_class = &this_class;
4443                     f |= SCF_DO_STCLASS_AND;
4444                     f &= ~SCF_DO_STCLASS_OR;
4445                 }
4446                 /* Exclude from super-linear cache processing any {n,m}
4447                    regops for which the combination of input pos and regex
4448                    pos is not enough information to determine if a match
4449                    will be possible.
4450
4451                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4452                    regex pos at the \s*, the prospects for a match depend not
4453                    only on the input position but also on how many (bar\s*)
4454                    repeats into the {4,8} we are. */
4455                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4456                     f &= ~SCF_WHILEM_VISITED_POS;
4457
4458                 /* This will finish on WHILEM, setting scan, or on NULL: */
4459                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4460                                   last, data, stopparen, recursed_depth, NULL,
4461                                   (mincount == 0
4462                                    ? (f & ~SCF_DO_SUBSTR)
4463                                    : f)
4464                                   ,depth+1);
4465
4466                 if (flags & SCF_DO_STCLASS)
4467                     data->start_class = oclass;
4468                 if (mincount == 0 || minnext == 0) {
4469                     if (flags & SCF_DO_STCLASS_OR) {
4470                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4471                     }
4472                     else if (flags & SCF_DO_STCLASS_AND) {
4473                         /* Switch to OR mode: cache the old value of
4474                          * data->start_class */
4475                         INIT_AND_WITHP;
4476                         StructCopy(data->start_class, and_withp, regnode_ssc);
4477                         flags &= ~SCF_DO_STCLASS_AND;
4478                         StructCopy(&this_class, data->start_class, regnode_ssc);
4479                         flags |= SCF_DO_STCLASS_OR;
4480                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4481                     }
4482                 } else {                /* Non-zero len */
4483                     if (flags & SCF_DO_STCLASS_OR) {
4484                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4485                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4486                     }
4487                     else if (flags & SCF_DO_STCLASS_AND)
4488                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489                     flags &= ~SCF_DO_STCLASS;
4490                 }
4491                 if (!scan)              /* It was not CURLYX, but CURLY. */
4492                     scan = next;
4493                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4494                     /* ? quantifier ok, except for (?{ ... }) */
4495                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4496                     && (minnext == 0) && (deltanext == 0)
4497                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4498                     && maxcount <= REG_INFTY/3) /* Complement check for big
4499                                                    count */
4500                 {
4501                     /* Fatal warnings may leak the regexp without this: */
4502                     SAVEFREESV(RExC_rx_sv);
4503                     ckWARNreg(RExC_parse,
4504                             "Quantifier unexpected on zero-length expression");
4505                     (void)ReREFCNT_inc(RExC_rx_sv);
4506                 }
4507
4508                 min += minnext * mincount;
4509                 is_inf_internal |= deltanext == SSize_t_MAX
4510                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4511                 is_inf |= is_inf_internal;
4512                 if (is_inf) {
4513                     delta = SSize_t_MAX;
4514                 } else {
4515                     delta += (minnext + deltanext) * maxcount
4516                              - minnext * mincount;
4517                 }
4518                 /* Try powerful optimization CURLYX => CURLYN. */
4519                 if (  OP(oscan) == CURLYX && data
4520                       && data->flags & SF_IN_PAR
4521                       && !(data->flags & SF_HAS_EVAL)
4522                       && !deltanext && minnext == 1 ) {
4523                     /* Try to optimize to CURLYN.  */
4524                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4525                     regnode * const nxt1 = nxt;
4526 #ifdef DEBUGGING
4527                     regnode *nxt2;
4528 #endif
4529
4530                     /* Skip open. */
4531                     nxt = regnext(nxt);
4532                     if (!REGNODE_SIMPLE(OP(nxt))
4533                         && !(PL_regkind[OP(nxt)] == EXACT
4534                              && STR_LEN(nxt) == 1))
4535                         goto nogo;
4536 #ifdef DEBUGGING
4537                     nxt2 = nxt;
4538 #endif
4539                     nxt = regnext(nxt);
4540                     if (OP(nxt) != CLOSE)
4541                         goto nogo;
4542                     if (RExC_open_parens) {
4543                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4544                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4545                     }
4546                     /* Now we know that nxt2 is the only contents: */
4547                     oscan->flags = (U8)ARG(nxt);
4548                     OP(oscan) = CURLYN;
4549                     OP(nxt1) = NOTHING; /* was OPEN. */
4550
4551 #ifdef DEBUGGING
4552                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4553                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4554                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4555                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4556                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4557                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4558 #endif
4559                 }
4560               nogo:
4561
4562                 /* Try optimization CURLYX => CURLYM. */
4563                 if (  OP(oscan) == CURLYX && data
4564                       && !(data->flags & SF_HAS_PAR)
4565                       && !(data->flags & SF_HAS_EVAL)
4566                       && !deltanext     /* atom is fixed width */
4567                       && minnext != 0   /* CURLYM can't handle zero width */
4568
4569                          /* Nor characters whose fold at run-time may be
4570                           * multi-character */
4571                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4572                 ) {
4573                     /* XXXX How to optimize if data == 0? */
4574                     /* Optimize to a simpler form.  */
4575                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4576                     regnode *nxt2;
4577
4578                     OP(oscan) = CURLYM;
4579                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4580                             && (OP(nxt2) != WHILEM))
4581                         nxt = nxt2;
4582                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4583                     /* Need to optimize away parenths. */
4584                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4585                         /* Set the parenth number.  */
4586                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4587
4588                         oscan->flags = (U8)ARG(nxt);
4589                         if (RExC_open_parens) {
4590                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4591                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4592                         }
4593                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4594                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4595
4596 #ifdef DEBUGGING
4597                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4598                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4599                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4600                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4601 #endif
4602 #if 0
4603                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4604                             regnode *nnxt = regnext(nxt1);
4605                             if (nnxt == nxt) {
4606                                 if (reg_off_by_arg[OP(nxt1)])
4607                                     ARG_SET(nxt1, nxt2 - nxt1);
4608                                 else if (nxt2 - nxt1 < U16_MAX)
4609                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4610                                 else
4611                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4612                             }
4613                             nxt1 = nnxt;
4614                         }
4615 #endif
4616                         /* Optimize again: */
4617                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4618                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4619                     }
4620                     else
4621                         oscan->flags = 0;
4622                 }
4623                 else if ((OP(oscan) == CURLYX)
4624                          && (flags & SCF_WHILEM_VISITED_POS)
4625                          /* See the comment on a similar expression above.
4626                             However, this time it's not a subexpression
4627                             we care about, but the expression itself. */
4628                          && (maxcount == REG_INFTY)
4629                          && data && ++data->whilem_c < 16) {
4630                     /* This stays as CURLYX, we can put the count/of pair. */
4631                     /* Find WHILEM (as in regexec.c) */
4632                     regnode *nxt = oscan + NEXT_OFF(oscan);
4633
4634                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4635                         nxt += ARG(nxt);
4636                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4637                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4638                 }
4639                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4640                     pars++;
4641                 if (flags & SCF_DO_SUBSTR) {
4642                     SV *last_str = NULL;
4643                     STRLEN last_chrs = 0;
4644                     int counted = mincount != 0;
4645
4646                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4647                                                                   string. */
4648                         SSize_t b = pos_before >= data->last_start_min
4649                             ? pos_before : data->last_start_min;
4650                         STRLEN l;
4651                         const char * const s = SvPV_const(data->last_found, l);
4652                         SSize_t old = b - data->last_start_min;
4653
4654                         if (UTF)
4655                             old = utf8_hop((U8*)s, old) - (U8*)s;
4656                         l -= old;
4657                         /* Get the added string: */
4658                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4659                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4660                                             (U8*)(s + old + l)) : l;
4661                         if (deltanext == 0 && pos_before == b) {
4662                             /* What was added is a constant string */
4663                             if (mincount > 1) {
4664
4665                                 SvGROW(last_str, (mincount * l) + 1);
4666                                 repeatcpy(SvPVX(last_str) + l,
4667                                           SvPVX_const(last_str), l,
4668                                           mincount - 1);
4669                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4670                                 /* Add additional parts. */
4671                                 SvCUR_set(data->last_found,
4672                                           SvCUR(data->last_found) - l);
4673                                 sv_catsv(data->last_found, last_str);
4674                                 {
4675                                     SV * sv = data->last_found;
4676                                     MAGIC *mg =
4677                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4678                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4679                                     if (mg && mg->mg_len >= 0)
4680                                         mg->mg_len += last_chrs * (mincount-1);
4681                                 }
4682                                 last_chrs *= mincount;
4683                                 data->last_end += l * (mincount - 1);
4684                             }
4685                         } else {
4686                             /* start offset must point into the last copy */
4687                             data->last_start_min += minnext * (mincount - 1);
4688                             data->last_start_max += is_inf ? SSize_t_MAX
4689                                 : (maxcount - 1) * (minnext + data->pos_delta);
4690                         }
4691                     }
4692                     /* It is counted once already... */
4693                     data->pos_min += minnext * (mincount - counted);
4694 #if 0
4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4696                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4697                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4698     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4699     (UV)mincount);
4700 if (deltanext != SSize_t_MAX)
4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4702     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4703           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4704 #endif
4705                     if (deltanext == SSize_t_MAX
4706                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4707                         data->pos_delta = SSize_t_MAX;
4708                     else
4709                         data->pos_delta += - counted * deltanext +
4710                         (minnext + deltanext) * maxcount - minnext * mincount;
4711                     if (mincount != maxcount) {
4712                          /* Cannot extend fixed substrings found inside
4713                             the group.  */
4714                         scan_commit(pRExC_state, data, minlenp, is_inf);
4715                         if (mincount && last_str) {
4716                             SV * const sv = data->last_found;
4717                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4718                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4719
4720                             if (mg)
4721                                 mg->mg_len = -1;
4722                             sv_setsv(sv, last_str);
4723                             data->last_end = data->pos_min;
4724                             data->last_start_min = data->pos_min - last_chrs;
4725                             data->last_start_max = is_inf
4726                                 ? SSize_t_MAX
4727                                 : data->pos_min + data->pos_delta - last_chrs;
4728                         }
4729                         data->longest = &(data->longest_float);
4730                     }
4731                     SvREFCNT_dec(last_str);
4732                 }
4733                 if (data && (fl & SF_HAS_EVAL))
4734                     data->flags |= SF_HAS_EVAL;
4735               optimize_curly_tail:
4736                 if (OP(oscan) != CURLYX) {
4737                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4738                            && NEXT_OFF(next))
4739                         NEXT_OFF(oscan) += NEXT_OFF(next);
4740                 }
4741                 continue;
4742
4743             default:
4744 #ifdef DEBUGGING
4745                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4746                                                                     OP(scan));
4747 #endif
4748             case REF:
4749             case CLUMP:
4750                 if (flags & SCF_DO_SUBSTR) {
4751                     /* Cannot expect anything... */
4752                     scan_commit(pRExC_state, data, minlenp, is_inf);
4753                     data->longest = &(data->longest_float);
4754                 }
4755                 is_inf = is_inf_internal = 1;
4756                 if (flags & SCF_DO_STCLASS_OR) {
4757                     if (OP(scan) == CLUMP) {
4758                         /* Actually is any start char, but very few code points
4759                          * aren't start characters */
4760                         ssc_match_all_cp(data->start_class);
4761                     }
4762                     else {
4763                         ssc_anything(data->start_class);
4764                     }
4765                 }
4766                 flags &= ~SCF_DO_STCLASS;
4767                 break;
4768             }
4769         }
4770         else if (OP(scan) == LNBREAK) {
4771             if (flags & SCF_DO_STCLASS) {
4772                 if (flags & SCF_DO_STCLASS_AND) {
4773                     ssc_intersection(data->start_class,
4774                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4775                     ssc_clear_locale(data->start_class);
4776                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4777                 }
4778                 else if (flags & SCF_DO_STCLASS_OR) {
4779                     ssc_union(data->start_class,
4780                               PL_XPosix_ptrs[_CC_VERTSPACE],
4781                               FALSE);
4782                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4783
4784                     /* See commit msg for
4785                      * 749e076fceedeb708a624933726e7989f2302f6a */
4786                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4787                 }
4788                 flags &= ~SCF_DO_STCLASS;
4789             }
4790             min++;
4791             delta++;    /* Because of the 2 char string cr-lf */
4792             if (flags & SCF_DO_SUBSTR) {
4793                 /* Cannot expect anything... */
4794                 scan_commit(pRExC_state, data, minlenp, is_inf);
4795                 data->pos_min += 1;
4796                 data->pos_delta += 1;
4797                 data->longest = &(data->longest_float);
4798             }
4799         }
4800         else if (REGNODE_SIMPLE(OP(scan))) {
4801
4802             if (flags & SCF_DO_SUBSTR) {
4803                 scan_commit(pRExC_state, data, minlenp, is_inf);
4804                 data->pos_min++;
4805             }
4806             min++;
4807             if (flags & SCF_DO_STCLASS) {
4808                 bool invert = 0;
4809                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4810                 U8 namedclass;
4811
4812                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4813                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4814
4815                 /* Some of the logic below assumes that switching
4816                    locale on will only add false positives. */
4817                 switch (OP(scan)) {
4818
4819                 default:
4820 #ifdef DEBUGGING
4821                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4822                                                                      OP(scan));
4823 #endif
4824                 case CANY:
4825                 case SANY:
4826                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4827                         ssc_match_all_cp(data->start_class);
4828                     break;
4829
4830                 case REG_ANY:
4831                     {
4832                         SV* REG_ANY_invlist = _new_invlist(2);
4833                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4834                                                             '\n');
4835                         if (flags & SCF_DO_STCLASS_OR) {
4836                             ssc_union(data->start_class,
4837                                       REG_ANY_invlist,
4838                                       TRUE /* TRUE => invert, hence all but \n
4839                                             */
4840                                       );
4841                         }
4842                         else if (flags & SCF_DO_STCLASS_AND) {
4843                             ssc_intersection(data->start_class,
4844                                              REG_ANY_invlist,
4845                                              TRUE  /* TRUE => invert */
4846                                              );
4847                             ssc_clear_locale(data->start_class);
4848                         }
4849                         SvREFCNT_dec_NN(REG_ANY_invlist);
4850                     }
4851                     break;
4852
4853                 case ANYOF:
4854                     if (flags & SCF_DO_STCLASS_AND)
4855                         ssc_and(pRExC_state, data->start_class,
4856                                 (regnode_charclass *) scan);
4857                     else
4858                         ssc_or(pRExC_state, data->start_class,
4859                                                           (regnode_charclass *) scan);
4860                     break;
4861
4862                 case NPOSIXL:
4863                     invert = 1;
4864                     /* FALL THROUGH */
4865
4866                 case POSIXL:
4867                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4868                     if (flags & SCF_DO_STCLASS_AND) {
4869                         bool was_there = cBOOL(
4870                                           ANYOF_POSIXL_TEST(data->start_class,
4871                                                                  namedclass));
4872                         ANYOF_POSIXL_ZERO(data->start_class);
4873                         if (was_there) {    /* Do an AND */
4874                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4875                         }
4876                         /* No individual code points can now match */
4877                         data->start_class->invlist
4878                                                 = sv_2mortal(_new_invlist(0));
4879                     }
4880                     else {
4881                         int complement = namedclass + ((invert) ? -1 : 1);
4882
4883                         assert(flags & SCF_DO_STCLASS_OR);
4884
4885                         /* If the complement of this class was already there,
4886                          * the result is that they match all code points,
4887                          * (\d + \D == everything).  Remove the classes from
4888                          * future consideration.  Locale is not relevant in
4889                          * this case */
4890                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4891                             ssc_match_all_cp(data->start_class);
4892                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4894                         }
4895                         else {  /* The usual case; just add this class to the
4896                                    existing set */
4897                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4898                         }
4899                     }
4900                     break;
4901
4902                 case NPOSIXA:   /* For these, we always know the exact set of
4903                                    what's matched */
4904                     invert = 1;
4905                     /* FALL THROUGH */
4906                 case POSIXA:
4907                     if (FLAGS(scan) == _CC_ASCII) {
4908                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4909                     }
4910                     else {
4911                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4912                                               PL_XPosix_ptrs[_CC_ASCII],
4913                                               &my_invlist);
4914                     }
4915                     goto join_posix;
4916
4917                 case NPOSIXD:
4918                 case NPOSIXU:
4919                     invert = 1;
4920                     /* FALL THROUGH */
4921                 case POSIXD:
4922                 case POSIXU:
4923                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4924
4925                     /* NPOSIXD matches all upper Latin1 code points unless the
4926                      * target string being matched is UTF-8, which is
4927                      * unknowable until match time.  Since we are going to
4928                      * invert, we want to get rid of all of them so that the
4929                      * inversion will match all */
4930                     if (OP(scan) == NPOSIXD) {
4931                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4932                                           &my_invlist);
4933                     }
4934
4935                   join_posix:
4936
4937                     if (flags & SCF_DO_STCLASS_AND) {
4938                         ssc_intersection(data->start_class, my_invlist, invert);
4939                         ssc_clear_locale(data->start_class);
4940                     }
4941                     else {
4942                         assert(flags & SCF_DO_STCLASS_OR);
4943                         ssc_union(data->start_class, my_invlist, invert);
4944                     }
4945                 }
4946                 if (flags & SCF_DO_STCLASS_OR)
4947                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4948                 flags &= ~SCF_DO_STCLASS;
4949             }
4950         }
4951         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4952             data->flags |= (OP(scan) == MEOL
4953                             ? SF_BEFORE_MEOL
4954                             : SF_BEFORE_SEOL);
4955             scan_commit(pRExC_state, data, minlenp, is_inf);
4956
4957         }
4958         else if (  PL_regkind[OP(scan)] == BRANCHJ
4959                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4960                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4961                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4962             if ( OP(scan) == UNLESSM &&
4963                  scan->flags == 0 &&
4964                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4965                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4966             ) {
4967                 regnode *opt;
4968                 regnode *upto= regnext(scan);
4969                 DEBUG_PARSE_r({
4970                     SV * const mysv_val=sv_newmortal();
4971                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4972
4973                     /*DEBUG_PARSE_MSG("opfail");*/
4974                     regprop(RExC_rx, mysv_val, upto, NULL);
4975                     PerlIO_printf(Perl_debug_log,
4976                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4977                         SvPV_nolen_const(mysv_val),
4978                         (IV)REG_NODE_NUM(upto),
4979                         (IV)(upto - scan)
4980                     );
4981                 });
4982                 OP(scan) = OPFAIL;
4983                 NEXT_OFF(scan) = upto - scan;
4984                 for (opt= scan + 1; opt < upto ; opt++)
4985                     OP(opt) = OPTIMIZED;
4986                 scan= upto;
4987                 continue;
4988             }
4989             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4990                 || OP(scan) == UNLESSM )
4991             {
4992                 /* Negative Lookahead/lookbehind
4993                    In this case we can't do fixed string optimisation.
4994                 */
4995
4996                 SSize_t deltanext, minnext, fake = 0;
4997                 regnode *nscan;
4998                 regnode_ssc intrnl;
4999                 int f = 0;
5000
5001                 data_fake.flags = 0;
5002                 if (data) {
5003                     data_fake.whilem_c = data->whilem_c;
5004                     data_fake.last_closep = data->last_closep;
5005                 }
5006                 else
5007                     data_fake.last_closep = &fake;
5008                 data_fake.pos_delta = delta;
5009                 if ( flags & SCF_DO_STCLASS && !scan->flags
5010                      && OP(scan) == IFMATCH ) { /* Lookahead */
5011                     ssc_init(pRExC_state, &intrnl);
5012                     data_fake.start_class = &intrnl;
5013                     f |= SCF_DO_STCLASS_AND;
5014                 }
5015                 if (flags & SCF_WHILEM_VISITED_POS)
5016                     f |= SCF_WHILEM_VISITED_POS;
5017                 next = regnext(scan);
5018                 nscan = NEXTOPER(NEXTOPER(scan));
5019                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5020                                       last, &data_fake, stopparen,
5021                                       recursed_depth, NULL, f, depth+1);
5022                 if (scan->flags) {
5023                     if (deltanext) {
5024                         FAIL("Variable length lookbehind not implemented");
5025                     }
5026                     else if (minnext > (I32)U8_MAX) {
5027                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5028                               (UV)U8_MAX);
5029                     }
5030                     scan->flags = (U8)minnext;
5031                 }
5032                 if (data) {
5033                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5034                         pars++;
5035                     if (data_fake.flags & SF_HAS_EVAL)
5036                         data->flags |= SF_HAS_EVAL;
5037                     data->whilem_c = data_fake.whilem_c;
5038                 }
5039                 if (f & SCF_DO_STCLASS_AND) {
5040                     if (flags & SCF_DO_STCLASS_OR) {
5041                         /* OR before, AND after: ideally we would recurse with
5042                          * data_fake to get the AND applied by study of the
5043                          * remainder of the pattern, and then derecurse;
5044                          * *** HACK *** for now just treat as "no information".
5045                          * See [perl #56690].
5046                          */
5047                         ssc_init(pRExC_state, data->start_class);
5048                     }  else {
5049                         /* AND before and after: combine and continue.  These
5050                          * assertions are zero-length, so can match an EMPTY
5051                          * string */
5052                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5053                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5054                     }
5055                 }
5056             }
5057 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5058             else {
5059                 /* Positive Lookahead/lookbehind
5060                    In this case we can do fixed string optimisation,
5061                    but we must be careful about it. Note in the case of
5062                    lookbehind the positions will be offset by the minimum
5063                    length of the pattern, something we won't know about
5064                    until after the recurse.
5065                 */
5066                 SSize_t deltanext, fake = 0;
5067                 regnode *nscan;
5068                 regnode_ssc intrnl;
5069                 int f = 0;
5070                 /* We use SAVEFREEPV so that when the full compile
5071                     is finished perl will clean up the allocated
5072                     minlens when it's all done. This way we don't
5073                     have to worry about freeing them when we know
5074                     they wont be used, which would be a pain.
5075                  */
5076                 SSize_t *minnextp;
5077                 Newx( minnextp, 1, SSize_t );
5078                 SAVEFREEPV(minnextp);
5079
5080                 if (data) {
5081                     StructCopy(data, &data_fake, scan_data_t);
5082                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5083                         f |= SCF_DO_SUBSTR;
5084                         if (scan->flags)
5085                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5086                         data_fake.last_found=newSVsv(data->last_found);
5087                     }
5088                 }
5089                 else
5090                     data_fake.last_closep = &fake;
5091                 data_fake.flags = 0;
5092                 data_fake.pos_delta = delta;
5093                 if (is_inf)
5094                     data_fake.flags |= SF_IS_INF;
5095                 if ( flags & SCF_DO_STCLASS && !scan->flags
5096                      && OP(scan) == IFMATCH ) { /* Lookahead */
5097                     ssc_init(pRExC_state, &intrnl);
5098                     data_fake.start_class = &intrnl;
5099                     f |= SCF_DO_STCLASS_AND;
5100                 }
5101                 if (flags & SCF_WHILEM_VISITED_POS)
5102                     f |= SCF_WHILEM_VISITED_POS;
5103                 next = regnext(scan);
5104                 nscan = NEXTOPER(NEXTOPER(scan));
5105
5106                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5107                                         &deltanext, last, &data_fake,
5108                                         stopparen, recursed_depth, NULL,
5109                                         f,depth+1);
5110                 if (scan->flags) {
5111                     if (deltanext) {
5112                         FAIL("Variable length lookbehind not implemented");
5113                     }
5114                     else if (*minnextp > (I32)U8_MAX) {
5115                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5116                               (UV)U8_MAX);
5117                     }
5118                     scan->flags = (U8)*minnextp;
5119                 }
5120
5121                 *minnextp += min;
5122
5123                 if (f & SCF_DO_STCLASS_AND) {
5124                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5125                     ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5126                 }
5127                 if (data) {
5128                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5129                         pars++;
5130                     if (data_fake.flags & SF_HAS_EVAL)
5131                         data->flags |= SF_HAS_EVAL;
5132                     data->whilem_c = data_fake.whilem_c;
5133                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5134                         if (RExC_rx->minlen<*minnextp)
5135                             RExC_rx->minlen=*minnextp;
5136                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5137                         SvREFCNT_dec_NN(data_fake.last_found);
5138
5139                         if ( data_fake.minlen_fixed != minlenp )
5140                         {
5141                             data->offset_fixed= data_fake.offset_fixed;
5142                             data->minlen_fixed= data_fake.minlen_fixed;
5143                             data->lookbehind_fixed+= scan->flags;
5144                         }
5145                         if ( data_fake.minlen_float != minlenp )
5146                         {
5147                             data->minlen_float= data_fake.minlen_float;
5148                             data->offset_float_min=data_fake.offset_float_min;
5149                             data->offset_float_max=data_fake.offset_float_max;
5150                             data->lookbehind_float+= scan->flags;
5151                         }
5152                     }
5153                 }
5154             }
5155 #endif
5156         }
5157         else if (OP(scan) == OPEN) {
5158             if (stopparen != (I32)ARG(scan))
5159                 pars++;
5160         }
5161         else if (OP(scan) == CLOSE) {
5162             if (stopparen == (I32)ARG(scan)) {
5163                 break;
5164             }
5165             if ((I32)ARG(scan) == is_par) {
5166                 next = regnext(scan);
5167
5168                 if ( next && (OP(next) != WHILEM) && next < last)
5169                     is_par = 0;         /* Disable optimization */
5170             }
5171             if (data)
5172                 *(data->last_closep) = ARG(scan);
5173         }
5174         else if (OP(scan) == EVAL) {
5175                 if (data)
5176                     data->flags |= SF_HAS_EVAL;
5177         }
5178         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5179             if (flags & SCF_DO_SUBSTR) {
5180                 scan_commit(pRExC_state, data, minlenp, is_inf);
5181                 flags &= ~SCF_DO_SUBSTR;
5182             }
5183             if (data && OP(scan)==ACCEPT) {
5184                 data->flags |= SCF_SEEN_ACCEPT;
5185                 if (stopmin > min)
5186                     stopmin = min;
5187             }
5188         }
5189         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5190         {
5191                 if (flags & SCF_DO_SUBSTR) {
5192                     scan_commit(pRExC_state, data, minlenp, is_inf);
5193                     data->longest = &(data->longest_float);
5194                 }
5195                 is_inf = is_inf_internal = 1;
5196                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5197                     ssc_anything(data->start_class);
5198                 flags &= ~SCF_DO_STCLASS;
5199         }
5200         else if (OP(scan) == GPOS) {
5201             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5202                 !(delta || is_inf || (data && data->pos_delta)))
5203             {
5204                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5205                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5206                 if (RExC_rx->gofs < (STRLEN)min)
5207                     RExC_rx->gofs = min;
5208             } else {
5209                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5210                 RExC_rx->gofs = 0;
5211             }
5212         }
5213 #ifdef TRIE_STUDY_OPT
5214 #ifdef FULL_TRIE_STUDY
5215         else if (PL_regkind[OP(scan)] == TRIE) {
5216             /* NOTE - There is similar code to this block above for handling
5217                BRANCH nodes on the initial study.  If you change stuff here
5218                check there too. */
5219             regnode *trie_node= scan;
5220             regnode *tail= regnext(scan);
5221             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5222             SSize_t max1 = 0, min1 = SSize_t_MAX;
5223             regnode_ssc accum;
5224
5225             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5226                 /* Cannot merge strings after this. */
5227                 scan_commit(pRExC_state, data, minlenp, is_inf);
5228             }
5229             if (flags & SCF_DO_STCLASS)
5230                 ssc_init_zero(pRExC_state, &accum);
5231
5232             if (!trie->jump) {
5233                 min1= trie->minlen;
5234                 max1= trie->maxlen;
5235             } else {
5236                 const regnode *nextbranch= NULL;
5237                 U32 word;
5238
5239                 for ( word=1 ; word <= trie->wordcount ; word++)
5240                 {
5241                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5242                     regnode_ssc this_class;
5243
5244                     data_fake.flags = 0;
5245                     if (data) {
5246                         data_fake.whilem_c = data->whilem_c;
5247                         data_fake.last_closep = data->last_closep;
5248                     }
5249                     else
5250                         data_fake.last_closep = &fake;
5251                     data_fake.pos_delta = delta;
5252                     if (flags & SCF_DO_STCLASS) {
5253                         ssc_init(pRExC_state, &this_class);
5254                         data_fake.start_class = &this_class;
5255                         f = SCF_DO_STCLASS_AND;
5256                     }
5257                     if (flags & SCF_WHILEM_VISITED_POS)
5258                         f |= SCF_WHILEM_VISITED_POS;
5259
5260                     if (trie->jump[word]) {
5261                         if (!nextbranch)
5262                             nextbranch = trie_node + trie->jump[0];
5263                         scan= trie_node + trie->jump[word];
5264                         /* We go from the jump point to the branch that follows
5265                            it. Note this means we need the vestigal unused
5266                            branches even though they arent otherwise used. */
5267                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5268                             &deltanext, (regnode *)nextbranch, &data_fake,
5269                             stopparen, recursed_depth, NULL, f,depth+1);
5270                     }
5271                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5272                         nextbranch= regnext((regnode*)nextbranch);
5273
5274                     if (min1 > (SSize_t)(minnext + trie->minlen))
5275                         min1 = minnext + trie->minlen;
5276                     if (deltanext == SSize_t_MAX) {
5277                         is_inf = is_inf_internal = 1;
5278                         max1 = SSize_t_MAX;
5279                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5280                         max1 = minnext + deltanext + trie->maxlen;
5281
5282                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5283                         pars++;
5284                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5285                         if ( stopmin > min + min1)
5286                             stopmin = min + min1;
5287                         flags &= ~SCF_DO_SUBSTR;
5288                         if (data)
5289                             data->flags |= SCF_SEEN_ACCEPT;
5290                     }
5291                     if (data) {
5292                         if (data_fake.flags & SF_HAS_EVAL)
5293                             data->flags |= SF_HAS_EVAL;
5294                         data->whilem_c = data_fake.whilem_c;
5295                     }
5296                     if (flags & SCF_DO_STCLASS)
5297                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5298                 }
5299             }
5300             if (flags & SCF_DO_SUBSTR) {
5301                 data->pos_min += min1;
5302                 data->pos_delta += max1 - min1;
5303                 if (max1 != min1 || is_inf)
5304                     data->longest = &(data->longest_float);
5305             }
5306             min += min1;
5307             delta += max1 - min1;
5308             if (flags & SCF_DO_STCLASS_OR) {
5309                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5310                 if (min1) {
5311                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5312                     flags &= ~SCF_DO_STCLASS;
5313                 }
5314             }
5315             else if (flags & SCF_DO_STCLASS_AND) {
5316                 if (min1) {
5317                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5318                     flags &= ~SCF_DO_STCLASS;
5319                 }
5320                 else {
5321                     /* Switch to OR mode: cache the old value of
5322                      * data->start_class */
5323                     INIT_AND_WITHP;
5324                     StructCopy(data->start_class, and_withp, regnode_ssc);
5325                     flags &= ~SCF_DO_STCLASS_AND;
5326                     StructCopy(&accum, data->start_class, regnode_ssc);
5327                     flags |= SCF_DO_STCLASS_OR;
5328                 }
5329             }
5330             scan= tail;
5331             continue;
5332         }
5333 #else
5334         else if (PL_regkind[OP(scan)] == TRIE) {
5335             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5336             U8*bang=NULL;
5337
5338             min += trie->minlen;
5339             delta += (trie->maxlen - trie->minlen);
5340             flags &= ~SCF_DO_STCLASS; /* xxx */
5341             if (flags & SCF_DO_SUBSTR) {
5342                 /* Cannot expect anything... */
5343                 scan_commit(pRExC_state, data, minlenp, is_inf);
5344                 data->pos_min += trie->minlen;
5345                 data->pos_delta += (trie->maxlen - trie->minlen);
5346                 if (trie->maxlen != trie->minlen)
5347                     data->longest = &(data->longest_float);
5348             }
5349             if (trie->jump) /* no more substrings -- for now /grr*/
5350                flags &= ~SCF_DO_SUBSTR;
5351         }
5352 #endif /* old or new */
5353 #endif /* TRIE_STUDY_OPT */
5354
5355         /* Else: zero-length, ignore. */
5356         scan = regnext(scan);
5357     }
5358     /* If we are exiting a recursion we can unset its recursed bit
5359      * and allow ourselves to enter it again - no danger of an
5360      * infinite loop there.
5361     if (stopparen > -1 && recursed) {
5362         DEBUG_STUDYDATA("unset:", data,depth);
5363         PAREN_UNSET( recursed, stopparen);
5364     }
5365     */
5366     if (frame) {
5367         DEBUG_STUDYDATA("frame-end:",data,depth);
5368         DEBUG_PEEP("fend", scan, depth);
5369         /* restore previous context */
5370         last = frame->last;
5371         scan = frame->next;
5372         stopparen = frame->stop;
5373         recursed_depth = frame->prev_recursed_depth;
5374         depth = depth - 1;
5375
5376         frame = frame->prev;
5377         goto fake_study_recurse;
5378     }
5379
5380   finish:
5381     assert(!frame);
5382     DEBUG_STUDYDATA("pre-fin:",data,depth);
5383
5384     *scanp = scan;
5385     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5386
5387     if (flags & SCF_DO_SUBSTR && is_inf)
5388         data->pos_delta = SSize_t_MAX - data->pos_min;
5389     if (is_par > (I32)U8_MAX)
5390         is_par = 0;
5391     if (is_par && pars==1 && data) {
5392         data->flags |= SF_IN_PAR;
5393         data->flags &= ~SF_HAS_PAR;
5394     }
5395     else if (pars && data) {
5396         data->flags |= SF_HAS_PAR;
5397         data->flags &= ~SF_IN_PAR;
5398     }
5399     if (flags & SCF_DO_STCLASS_OR)
5400         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5401     if (flags & SCF_TRIE_RESTUDY)
5402         data->flags |=  SCF_TRIE_RESTUDY;
5403
5404     DEBUG_STUDYDATA("post-fin:",data,depth);
5405
5406     {
5407         SSize_t final_minlen= min < stopmin ? min : stopmin;
5408
5409         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5410             RExC_maxlen = final_minlen + delta;
5411         }
5412         return final_minlen;
5413     }
5414     /* not-reached */
5415 }
5416
5417 STATIC U32
5418 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5419 {
5420     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5421
5422     PERL_ARGS_ASSERT_ADD_DATA;
5423
5424     Renewc(RExC_rxi->data,
5425            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5426            char, struct reg_data);
5427     if(count)
5428         Renew(RExC_rxi->data->what, count + n, U8);
5429     else
5430         Newx(RExC_rxi->data->what, n, U8);
5431     RExC_rxi->data->count = count + n;
5432     Copy(s, RExC_rxi->data->what + count, n, U8);
5433     return count;
5434 }
5435
5436 /*XXX: todo make this not included in a non debugging perl */
5437 #ifndef PERL_IN_XSUB_RE
5438 void
5439 Perl_reginitcolors(pTHX)
5440 {
5441     dVAR;
5442     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5443     if (s) {
5444         char *t = savepv(s);
5445         int i = 0;
5446         PL_colors[0] = t;
5447         while (++i < 6) {
5448             t = strchr(t, '\t');
5449             if (t) {
5450                 *t = '\0';
5451                 PL_colors[i] = ++t;
5452             }
5453             else
5454                 PL_colors[i] = t = (char *)"";
5455         }
5456     } else {
5457         int i = 0;
5458         while (i < 6)
5459             PL_colors[i++] = (char *)"";
5460     }
5461     PL_colorset = 1;
5462 }
5463 #endif
5464
5465
5466 #ifdef TRIE_STUDY_OPT
5467 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5468     STMT_START {                                            \
5469         if (                                                \
5470               (data.flags & SCF_TRIE_RESTUDY)               \
5471               && ! restudied++                              \
5472         ) {                                                 \
5473             dOsomething;                                    \
5474             goto reStudy;                                   \
5475         }                                                   \
5476     } STMT_END
5477 #else
5478 #define CHECK_RESTUDY_GOTO_butfirst
5479 #endif
5480
5481 /*
5482  * pregcomp - compile a regular expression into internal code
5483  *
5484  * Decides which engine's compiler to call based on the hint currently in
5485  * scope
5486  */
5487
5488 #ifndef PERL_IN_XSUB_RE
5489
5490 /* return the currently in-scope regex engine (or the default if none)  */
5491
5492 regexp_engine const *
5493 Perl_current_re_engine(pTHX)
5494 {
5495     dVAR;
5496
5497     if (IN_PERL_COMPILETIME) {
5498         HV * const table = GvHV(PL_hintgv);
5499         SV **ptr;
5500
5501         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5502             return &PL_core_reg_engine;
5503         ptr = hv_fetchs(table, "regcomp", FALSE);
5504         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5505             return &PL_core_reg_engine;
5506         return INT2PTR(regexp_engine*,SvIV(*ptr));
5507     }
5508     else {
5509         SV *ptr;
5510         if (!PL_curcop->cop_hints_hash)
5511             return &PL_core_reg_engine;
5512         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5513         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5514             return &PL_core_reg_engine;
5515         return INT2PTR(regexp_engine*,SvIV(ptr));
5516     }
5517 }
5518
5519
5520 REGEXP *
5521 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5522 {
5523     dVAR;
5524     regexp_engine const *eng = current_re_engine();
5525     GET_RE_DEBUG_FLAGS_DECL;
5526
5527     PERL_ARGS_ASSERT_PREGCOMP;
5528
5529     /* Dispatch a request to compile a regexp to correct regexp engine. */
5530     DEBUG_COMPILE_r({
5531         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5532                         PTR2UV(eng));
5533     });
5534     return CALLREGCOMP_ENG(eng, pattern, flags);
5535 }
5536 #endif
5537
5538 /* public(ish) entry point for the perl core's own regex compiling code.
5539  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5540  * pattern rather than a list of OPs, and uses the internal engine rather
5541  * than the current one */
5542
5543 REGEXP *
5544 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5545 {
5546     SV *pat = pattern; /* defeat constness! */
5547     PERL_ARGS_ASSERT_RE_COMPILE;
5548     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5549 #ifdef PERL_IN_XSUB_RE
5550                                 &my_reg_engine,
5551 #else
5552                                 &PL_core_reg_engine,
5553 #endif
5554                                 NULL, NULL, rx_flags, 0);
5555 }
5556
5557
5558 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5559  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5560  * point to the realloced string and length.
5561  *
5562  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5563  * stuff added */
5564
5565 static void
5566 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5567                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5568 {
5569     U8 *const src = (U8*)*pat_p;
5570     U8 *dst;
5571     int n=0;
5572     STRLEN s = 0, d = 0;
5573     bool do_end = 0;
5574     GET_RE_DEBUG_FLAGS_DECL;
5575
5576     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5577         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5578
5579     Newx(dst, *plen_p * 2 + 1, U8);
5580
5581     while (s < *plen_p) {
5582         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5583             dst[d]   = src[s];
5584         else {
5585             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5586             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5587         }
5588         if (n < num_code_blocks) {
5589             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5590                 pRExC_state->code_blocks[n].start = d;
5591                 assert(dst[d] == '(');
5592                 do_end = 1;
5593             }
5594             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5595                 pRExC_state->code_blocks[n].end = d;
5596                 assert(dst[d] == ')');
5597                 do_end = 0;
5598                 n++;
5599             }
5600         }
5601         s++;
5602         d++;
5603     }
5604     dst[d] = '\0';
5605     *plen_p = d;
5606     *pat_p = (char*) dst;
5607     SAVEFREEPV(*pat_p);
5608     RExC_orig_utf8 = RExC_utf8 = 1;
5609 }
5610
5611
5612
5613 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5614  * while recording any code block indices, and handling overloading,
5615  * nested qr// objects etc.  If pat is null, it will allocate a new
5616  * string, or just return the first arg, if there's only one.
5617  *
5618  * Returns the malloced/updated pat.
5619  * patternp and pat_count is the array of SVs to be concatted;
5620  * oplist is the optional list of ops that generated the SVs;
5621  * recompile_p is a pointer to a boolean that will be set if
5622  *   the regex will need to be recompiled.
5623  * delim, if non-null is an SV that will be inserted between each element
5624  */
5625
5626 static SV*
5627 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5628                 SV *pat, SV ** const patternp, int pat_count,
5629                 OP *oplist, bool *recompile_p, SV *delim)
5630 {
5631     SV **svp;
5632     int n = 0;
5633     bool use_delim = FALSE;
5634     bool alloced = FALSE;
5635
5636     /* if we know we have at least two args, create an empty string,
5637      * then concatenate args to that. For no args, return an empty string */
5638     if (!pat && pat_count != 1) {
5639         pat = newSVpvn("", 0);
5640         SAVEFREESV(pat);
5641         alloced = TRUE;
5642     }
5643
5644     for (svp = patternp; svp < patternp + pat_count; svp++) {
5645         SV *sv;
5646         SV *rx  = NULL;
5647         STRLEN orig_patlen = 0;
5648         bool code = 0;
5649         SV *msv = use_delim ? delim : *svp;
5650         if (!msv) msv = &PL_sv_undef;
5651
5652         /* if we've got a delimiter, we go round the loop twice for each
5653          * svp slot (except the last), using the delimiter the second
5654          * time round */
5655         if (use_delim) {
5656             svp--;
5657             use_delim = FALSE;
5658         }
5659         else if (delim)
5660             use_delim = TRUE;
5661
5662         if (SvTYPE(msv) == SVt_PVAV) {
5663             /* we've encountered an interpolated array within
5664              * the pattern, e.g. /...@a..../. Expand the list of elements,
5665              * then recursively append elements.
5666              * The code in this block is based on S_pushav() */
5667
5668             AV *const av = (AV*)msv;
5669             const SSize_t maxarg = AvFILL(av) + 1;
5670             SV **array;
5671
5672             if (oplist) {
5673                 assert(oplist->op_type == OP_PADAV
5674                     || oplist->op_type == OP_RV2AV);
5675                 oplist = oplist->op_sibling;;
5676             }
5677
5678             if (SvRMAGICAL(av)) {
5679                 SSize_t i;
5680
5681                 Newx(array, maxarg, SV*);
5682                 SAVEFREEPV(array);
5683                 for (i=0; i < maxarg; i++) {
5684                     SV ** const svp = av_fetch(av, i, FALSE);
5685                     array[i] = svp ? *svp : &PL_sv_undef;
5686                 }
5687             }
5688             else
5689                 array = AvARRAY(av);
5690
5691             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5692                                 array, maxarg, NULL, recompile_p,
5693                                 /* $" */
5694                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5695
5696             continue;
5697         }
5698
5699
5700         /* we make the assumption here that each op in the list of
5701          * op_siblings maps to one SV pushed onto the stack,
5702          * except for code blocks, with have both an OP_NULL and
5703          * and OP_CONST.
5704          * This allows us to match up the list of SVs against the
5705          * list of OPs to find the next code block.
5706          *
5707          * Note that       PUSHMARK PADSV PADSV ..
5708          * is optimised to
5709          *                 PADRANGE PADSV  PADSV  ..
5710          * so the alignment still works. */
5711
5712         if (oplist) {
5713             if (oplist->op_type == OP_NULL
5714                 && (oplist->op_flags & OPf_SPECIAL))
5715             {
5716                 assert(n < pRExC_state->num_code_blocks);
5717                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5718                 pRExC_state->code_blocks[n].block = oplist;
5719                 pRExC_state->code_blocks[n].src_regex = NULL;
5720                 n++;
5721                 code = 1;
5722                 oplist = oplist->op_sibling; /* skip CONST */
5723                 assert(oplist);
5724             }
5725             oplist = oplist->op_sibling;;
5726         }
5727
5728         /* apply magic and QR overloading to arg */
5729
5730         SvGETMAGIC(msv);
5731         if (SvROK(msv) && SvAMAGIC(msv)) {
5732             SV *sv = AMG_CALLunary(msv, regexp_amg);
5733             if (sv) {
5734                 if (SvROK(sv))
5735                     sv = SvRV(sv);
5736                 if (SvTYPE(sv) != SVt_REGEXP)
5737                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5738                 msv = sv;
5739             }
5740         }
5741
5742         /* try concatenation overload ... */
5743         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5744                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5745         {
5746             sv_setsv(pat, sv);
5747             /* overloading involved: all bets are off over literal
5748              * code. Pretend we haven't seen it */
5749             pRExC_state->num_code_blocks -= n;
5750             n = 0;
5751         }
5752         else  {
5753             /* ... or failing that, try "" overload */
5754             while (SvAMAGIC(msv)
5755                     && (sv = AMG_CALLunary(msv, string_amg))
5756                     && sv != msv
5757                     &&  !(   SvROK(msv)
5758                           && SvROK(sv)
5759                           && SvRV(msv) == SvRV(sv))
5760             ) {
5761                 msv = sv;
5762                 SvGETMAGIC(msv);
5763             }
5764             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5765                 msv = SvRV(msv);
5766
5767             if (pat) {
5768                 /* this is a partially unrolled
5769                  *     sv_catsv_nomg(pat, msv);
5770                  * that allows us to adjust code block indices if
5771                  * needed */
5772                 STRLEN dlen;
5773                 char *dst = SvPV_force_nomg(pat, dlen);
5774                 orig_patlen = dlen;
5775                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5776                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5777                     sv_setpvn(pat, dst, dlen);
5778                     SvUTF8_on(pat);
5779                 }
5780                 sv_catsv_nomg(pat, msv);
5781                 rx = msv;
5782             }
5783             else
5784                 pat = msv;
5785
5786             if (code)
5787                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5788         }
5789
5790         /* extract any code blocks within any embedded qr//'s */
5791         if (rx && SvTYPE(rx) == SVt_REGEXP
5792             && RX_ENGINE((REGEXP*)rx)->op_comp)
5793         {
5794
5795             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5796             if (ri->num_code_blocks) {
5797                 int i;
5798                 /* the presence of an embedded qr// with code means
5799                  * we should always recompile: the text of the
5800                  * qr// may not have changed, but it may be a
5801                  * different closure than last time */
5802                 *recompile_p = 1;
5803                 Renew(pRExC_state->code_blocks,
5804                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5805                     struct reg_code_block);
5806                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5807
5808                 for (i=0; i < ri->num_code_blocks; i++) {
5809                     struct reg_code_block *src, *dst;
5810                     STRLEN offset =  orig_patlen
5811                         + ReANY((REGEXP *)rx)->pre_prefix;
5812                     assert(n < pRExC_state->num_code_blocks);
5813                     src = &ri->code_blocks[i];
5814                     dst = &pRExC_state->code_blocks[n];
5815                     dst->start      = src->start + offset;
5816                     dst->end        = src->end   + offset;
5817                     dst->block      = src->block;
5818                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5819                                             src->src_regex
5820                                                 ? src->src_regex
5821                                                 : (REGEXP*)rx);
5822                     n++;
5823                 }
5824             }
5825         }
5826     }
5827     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5828     if (alloced)
5829         SvSETMAGIC(pat);
5830
5831     return pat;
5832 }
5833
5834
5835
5836 /* see if there are any run-time code blocks in the pattern.
5837  * False positives are allowed */
5838
5839 static bool
5840 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5841                     char *pat, STRLEN plen)
5842 {
5843     int n = 0;
5844     STRLEN s;
5845
5846     for (s = 0; s < plen; s++) {
5847         if (n < pRExC_state->num_code_blocks
5848             && s == pRExC_state->code_blocks[n].start)
5849         {
5850             s = pRExC_state->code_blocks[n].end;
5851             n++;
5852             continue;
5853         }
5854         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5855          * positives here */
5856         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5857             (pat[s+2] == '{'
5858                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5859         )
5860             return 1;
5861     }
5862     return 0;
5863 }
5864
5865 /* Handle run-time code blocks. We will already have compiled any direct
5866  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5867  * copy of it, but with any literal code blocks blanked out and
5868  * appropriate chars escaped; then feed it into
5869  *
5870  *    eval "qr'modified_pattern'"
5871  *
5872  * For example,
5873  *
5874  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5875  *
5876  * becomes
5877  *
5878  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5879  *
5880  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5881  * and merge them with any code blocks of the original regexp.
5882  *
5883  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5884  * instead, just save the qr and return FALSE; this tells our caller that
5885  * the original pattern needs upgrading to utf8.
5886  */
5887
5888 static bool
5889 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5890     char *pat, STRLEN plen)
5891 {
5892     SV *qr;
5893
5894     GET_RE_DEBUG_FLAGS_DECL;
5895
5896     if (pRExC_state->runtime_code_qr) {
5897         /* this is the second time we've been called; this should
5898          * only happen if the main pattern got upgraded to utf8
5899          * during compilation; re-use the qr we compiled first time
5900          * round (which should be utf8 too)
5901          */
5902         qr = pRExC_state->runtime_code_qr;
5903         pRExC_state->runtime_code_qr = NULL;
5904         assert(RExC_utf8 && SvUTF8(qr));
5905     }
5906     else {
5907         int n = 0;
5908         STRLEN s;
5909         char *p, *newpat;
5910         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5911         SV *sv, *qr_ref;
5912         dSP;
5913
5914         /* determine how many extra chars we need for ' and \ escaping */
5915         for (s = 0; s < plen; s++) {
5916             if (pat[s] == '\'' || pat[s] == '\\')
5917                 newlen++;
5918         }
5919
5920         Newx(newpat, newlen, char);
5921         p = newpat;
5922         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5923
5924         for (s = 0; s < plen; s++) {
5925             if (n < pRExC_state->num_code_blocks
5926                 && s == pRExC_state->code_blocks[n].start)
5927             {
5928                 /* blank out literal code block */
5929                 assert(pat[s] == '(');
5930                 while (s <= pRExC_state->code_blocks[n].end) {
5931                     *p++ = '_';
5932                     s++;
5933                 }
5934                 s--;
5935                 n++;
5936                 continue;
5937             }
5938             if (pat[s] == '\'' || pat[s] == '\\')
5939                 *p++ = '\\';
5940             *p++ = pat[s];
5941         }
5942         *p++ = '\'';
5943         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5944             *p++ = 'x';
5945         *p++ = '\0';
5946         DEBUG_COMPILE_r({
5947             PerlIO_printf(Perl_debug_log,
5948                 "%sre-parsing pattern for runtime code:%s %s\n",
5949                 PL_colors[4],PL_colors[5],newpat);
5950         });
5951
5952         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5953         Safefree(newpat);
5954
5955         ENTER;
5956         SAVETMPS;
5957         save_re_context();
5958         PUSHSTACKi(PERLSI_REQUIRE);
5959         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5960          * parsing qr''; normally only q'' does this. It also alters
5961          * hints handling */
5962         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5963         SvREFCNT_dec_NN(sv);
5964         SPAGAIN;
5965         qr_ref = POPs;
5966         PUTBACK;
5967         {
5968             SV * const errsv = ERRSV;
5969             if (SvTRUE_NN(errsv))
5970             {
5971                 Safefree(pRExC_state->code_blocks);
5972                 /* use croak_sv ? */
5973                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5974             }
5975         }
5976         assert(SvROK(qr_ref));
5977         qr = SvRV(qr_ref);
5978         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5979         /* the leaving below frees the tmp qr_ref.
5980          * Give qr a life of its own */
5981         SvREFCNT_inc(qr);
5982         POPSTACK;
5983         FREETMPS;
5984         LEAVE;
5985
5986     }
5987
5988     if (!RExC_utf8 && SvUTF8(qr)) {
5989         /* first time through; the pattern got upgraded; save the
5990          * qr for the next time through */
5991         assert(!pRExC_state->runtime_code_qr);
5992         pRExC_state->runtime_code_qr = qr;
5993         return 0;
5994     }
5995
5996
5997     /* extract any code blocks within the returned qr//  */
5998
5999
6000     /* merge the main (r1) and run-time (r2) code blocks into one */
6001     {
6002         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6003         struct reg_code_block *new_block, *dst;
6004         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6005         int i1 = 0, i2 = 0;
6006
6007         if (!r2->num_code_blocks) /* we guessed wrong */
6008         {
6009             SvREFCNT_dec_NN(qr);
6010             return 1;
6011         }
6012
6013         Newx(new_block,
6014             r1->num_code_blocks + r2->num_code_blocks,
6015             struct reg_code_block);
6016         dst = new_block;
6017
6018         while (    i1 < r1->num_code_blocks
6019                 || i2 < r2->num_code_blocks)
6020         {
6021             struct reg_code_block *src;
6022             bool is_qr = 0;
6023
6024             if (i1 == r1->num_code_blocks) {
6025                 src = &r2->code_blocks[i2++];
6026                 is_qr = 1;
6027             }
6028             else if (i2 == r2->num_code_blocks)
6029                 src = &r1->code_blocks[i1++];
6030             else if (  r1->code_blocks[i1].start
6031                      < r2->code_blocks[i2].start)
6032             {
6033                 src = &r1->code_blocks[i1++];
6034                 assert(src->end < r2->code_blocks[i2].start);
6035             }
6036             else {
6037                 assert(  r1->code_blocks[i1].start
6038                        > r2->code_blocks[i2].start);
6039                 src = &r2->code_blocks[i2++];
6040                 is_qr = 1;
6041                 assert(src->end < r1->code_blocks[i1].start);
6042             }
6043
6044             assert(pat[src->start] == '(');
6045             assert(pat[src->end]   == ')');
6046             dst->start      = src->start;
6047             dst->end        = src->end;
6048             dst->block      = src->block;
6049             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6050                                     : src->src_regex;
6051             dst++;
6052         }
6053         r1->num_code_blocks += r2->num_code_blocks;
6054         Safefree(r1->code_blocks);
6055         r1->code_blocks = new_block;
6056     }
6057
6058     SvREFCNT_dec_NN(qr);
6059     return 1;
6060 }
6061
6062
6063 STATIC bool
6064 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6065                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6066                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6067                       STRLEN longest_length, bool eol, bool meol)
6068 {
6069     /* This is the common code for setting up the floating and fixed length
6070      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6071      * as to whether succeeded or not */
6072
6073     I32 t;
6074     SSize_t ml;
6075
6076     if (! (longest_length
6077            || (eol /* Can't have SEOL and MULTI */
6078                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6079           )
6080             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6081         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6082     {
6083         return FALSE;
6084     }
6085
6086     /* copy the information about the longest from the reg_scan_data
6087         over to the program. */
6088     if (SvUTF8(sv_longest)) {
6089         *rx_utf8 = sv_longest;
6090         *rx_substr = NULL;
6091     } else {
6092         *rx_substr = sv_longest;
6093         *rx_utf8 = NULL;
6094     }
6095     /* end_shift is how many chars that must be matched that
6096         follow this item. We calculate it ahead of time as once the
6097         lookbehind offset is added in we lose the ability to correctly
6098         calculate it.*/
6099     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6100     *rx_end_shift = ml - offset
6101         - longest_length + (SvTAIL(sv_longest) != 0)
6102         + lookbehind;
6103
6104     t = (eol/* Can't have SEOL and MULTI */
6105          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6106     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6107
6108     return TRUE;
6109 }
6110
6111 /*
6112  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6113  * regular expression into internal code.
6114  * The pattern may be passed either as:
6115  *    a list of SVs (patternp plus pat_count)
6116  *    a list of OPs (expr)
6117  * If both are passed, the SV list is used, but the OP list indicates
6118  * which SVs are actually pre-compiled code blocks
6119  *
6120  * The SVs in the list have magic and qr overloading applied to them (and
6121  * the list may be modified in-place with replacement SVs in the latter
6122  * case).
6123  *
6124  * If the pattern hasn't changed from old_re, then old_re will be
6125  * returned.
6126  *
6127  * eng is the current engine. If that engine has an op_comp method, then
6128  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6129  * do the initial concatenation of arguments and pass on to the external
6130  * engine.
6131  *
6132  * If is_bare_re is not null, set it to a boolean indicating whether the
6133  * arg list reduced (after overloading) to a single bare regex which has
6134  * been returned (i.e. /$qr/).
6135  *
6136  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6137  *
6138  * pm_flags contains the PMf_* flags, typically based on those from the
6139  * pm_flags field of the related PMOP. Currently we're only interested in
6140  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6141  *
6142  * We can't allocate space until we know how big the compiled form will be,
6143  * but we can't compile it (and thus know how big it is) until we've got a
6144  * place to put the code.  So we cheat:  we compile it twice, once with code
6145  * generation turned off and size counting turned on, and once "for real".
6146  * This also means that we don't allocate space until we are sure that the
6147  * thing really will compile successfully, and we never have to move the
6148  * code and thus invalidate pointers into it.  (Note that it has to be in
6149  * one piece because free() must be able to free it all.) [NB: not true in perl]
6150  *
6151  * Beware that the optimization-preparation code in here knows about some
6152  * of the structure of the compiled regexp.  [I'll say.]
6153  */
6154
6155 REGEXP *
6156 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6157                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6158                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6159 {
6160     dVAR;
6161     REGEXP *rx;
6162     struct regexp *r;
6163     regexp_internal *ri;
6164     STRLEN plen;
6165     char *exp;
6166     regnode *scan;
6167     I32 flags;
6168     SSize_t minlen = 0;
6169     U32 rx_flags;
6170     SV *pat;
6171     SV *code_blocksv = NULL;
6172     SV** new_patternp = patternp;
6173
6174     /* these are all flags - maybe they should be turned
6175      * into a single int with different bit masks */
6176     I32 sawlookahead = 0;
6177     I32 sawplus = 0;
6178     I32 sawopen = 0;
6179     I32 sawminmod = 0;
6180
6181     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6182     bool recompile = 0;
6183     bool runtime_code = 0;
6184     scan_data_t data;
6185     RExC_state_t RExC_state;
6186     RExC_state_t * const pRExC_state = &RExC_state;
6187 #ifdef TRIE_STUDY_OPT
6188     int restudied = 0;
6189     RExC_state_t copyRExC_state;
6190 #endif
6191     GET_RE_DEBUG_FLAGS_DECL;
6192
6193     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6194
6195     DEBUG_r(if (!PL_colorset) reginitcolors());
6196
6197 #ifndef PERL_IN_XSUB_RE
6198     /* Initialize these here instead of as-needed, as is quick and avoids
6199      * having to test them each time otherwise */
6200     if (! PL_AboveLatin1) {
6201         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6202         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6203         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6204         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6205         PL_HasMultiCharFold =
6206                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6207     }
6208 #endif
6209
6210     pRExC_state->code_blocks = NULL;
6211     pRExC_state->num_code_blocks = 0;
6212
6213     if (is_bare_re)
6214         *is_bare_re = FALSE;
6215
6216     if (expr && (expr->op_type == OP_LIST ||
6217                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6218         /* allocate code_blocks if needed */
6219         OP *o;
6220         int ncode = 0;
6221
6222         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6223             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6224                 ncode++; /* count of DO blocks */
6225         if (ncode) {
6226             pRExC_state->num_code_blocks = ncode;
6227             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6228         }
6229     }
6230
6231     if (!pat_count) {
6232         /* compile-time pattern with just OP_CONSTs and DO blocks */
6233
6234         int n;
6235         OP *o;
6236
6237         /* find how many CONSTs there are */
6238         assert(expr);
6239         n = 0;
6240         if (expr->op_type == OP_CONST)
6241             n = 1;
6242         else
6243             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6244                 if (o->op_type == OP_CONST)
6245                     n++;
6246             }
6247
6248         /* fake up an SV array */
6249
6250         assert(!new_patternp);
6251         Newx(new_patternp, n, SV*);
6252         SAVEFREEPV(new_patternp);
6253         pat_count = n;
6254
6255         n = 0;
6256         if (expr->op_type == OP_CONST)
6257             new_patternp[n] = cSVOPx_sv(expr);
6258         else
6259             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6260                 if (o->op_type == OP_CONST)
6261                     new_patternp[n++] = cSVOPo_sv;
6262             }
6263
6264     }
6265
6266     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6267         "Assembling pattern from %d elements%s\n", pat_count,
6268             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6269
6270     /* set expr to the first arg op */
6271
6272     if (pRExC_state->num_code_blocks
6273          && expr->op_type != OP_CONST)
6274     {
6275             expr = cLISTOPx(expr)->op_first;
6276             assert(   expr->op_type == OP_PUSHMARK
6277                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6278                    || expr->op_type == OP_PADRANGE);
6279             expr = expr->op_sibling;
6280     }
6281
6282     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6283                         expr, &recompile, NULL);
6284
6285     /* handle bare (possibly after overloading) regex: foo =~ $re */
6286     {
6287         SV *re = pat;
6288         if (SvROK(re))
6289             re = SvRV(re);
6290         if (SvTYPE(re) == SVt_REGEXP) {
6291             if (is_bare_re)
6292                 *is_bare_re = TRUE;
6293             SvREFCNT_inc(re);
6294             Safefree(pRExC_state->code_blocks);
6295             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6296                 "Precompiled pattern%s\n",
6297                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6298
6299             return (REGEXP*)re;
6300         }
6301     }
6302
6303     exp = SvPV_nomg(pat, plen);
6304
6305     if (!eng->op_comp) {
6306         if ((SvUTF8(pat) && IN_BYTES)
6307                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6308         {
6309             /* make a temporary copy; either to convert to bytes,
6310              * or to avoid repeating get-magic / overloaded stringify */
6311             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6312                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6313         }
6314         Safefree(pRExC_state->code_blocks);
6315         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6316     }
6317
6318     /* ignore the utf8ness if the pattern is 0 length */
6319     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6320     RExC_uni_semantics = 0;
6321     RExC_contains_locale = 0;
6322     RExC_contains_i = 0;
6323     pRExC_state->runtime_code_qr = NULL;
6324
6325     DEBUG_COMPILE_r({
6326             SV *dsv= sv_newmortal();
6327             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6328             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6329                           PL_colors[4],PL_colors[5],s);
6330         });
6331
6332   redo_first_pass:
6333     /* we jump here if we upgrade the pattern to utf8 and have to
6334      * recompile */
6335
6336     if ((pm_flags & PMf_USE_RE_EVAL)
6337                 /* this second condition covers the non-regex literal case,
6338                  * i.e.  $foo =~ '(?{})'. */
6339                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6340     )
6341         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6342
6343     /* return old regex if pattern hasn't changed */
6344     /* XXX: note in the below we have to check the flags as well as the
6345      * pattern.
6346      *
6347      * Things get a touch tricky as we have to compare the utf8 flag
6348      * independently from the compile flags.  */
6349
6350     if (   old_re
6351         && !recompile
6352         && !!RX_UTF8(old_re) == !!RExC_utf8
6353         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6354         && RX_PRECOMP(old_re)
6355         && RX_PRELEN(old_re) == plen
6356         && memEQ(RX_PRECOMP(old_re), exp, plen)
6357         && !runtime_code /* with runtime code, always recompile */ )
6358     {
6359         Safefree(pRExC_state->code_blocks);
6360         return old_re;
6361     }
6362
6363     rx_flags = orig_rx_flags;
6364
6365     if (rx_flags & PMf_FOLD) {
6366         RExC_contains_i = 1;
6367     }
6368     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6369
6370         /* Set to use unicode semantics if the pattern is in utf8 and has the
6371          * 'depends' charset specified, as it means unicode when utf8  */
6372         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6373     }
6374
6375     RExC_precomp = exp;
6376     RExC_flags = rx_flags;
6377     RExC_pm_flags = pm_flags;
6378
6379     if (runtime_code) {
6380         if (TAINTING_get && TAINT_get)
6381             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6382
6383         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6384             /* whoops, we have a non-utf8 pattern, whilst run-time code
6385              * got compiled as utf8. Try again with a utf8 pattern */
6386             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6387                                     pRExC_state->num_code_blocks);
6388             goto redo_first_pass;
6389         }
6390     }
6391     assert(!pRExC_state->runtime_code_qr);
6392
6393     RExC_sawback = 0;
6394
6395     RExC_seen = 0;
6396     RExC_maxlen = 0;
6397     RExC_in_lookbehind = 0;
6398     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6399     RExC_extralen = 0;
6400     RExC_override_recoding = 0;
6401     RExC_in_multi_char_class = 0;
6402
6403     /* First pass: determine size, legality. */
6404     RExC_parse = exp;
6405     RExC_start = exp;
6406     RExC_end = exp + plen;
6407     RExC_naughty = 0;
6408     RExC_npar = 1;
6409     RExC_nestroot = 0;
6410     RExC_size = 0L;
6411     RExC_emit = (regnode *) &RExC_emit_dummy;
6412     RExC_whilem_seen = 0;
6413     RExC_open_parens = NULL;
6414     RExC_close_parens = NULL;
6415     RExC_opend = NULL;
6416     RExC_paren_names = NULL;
6417 #ifdef DEBUGGING
6418     RExC_paren_name_list = NULL;
6419 #endif
6420     RExC_recurse = NULL;
6421     RExC_study_chunk_recursed = NULL;
6422     RExC_study_chunk_recursed_bytes= 0;
6423     RExC_recurse_count = 0;
6424     pRExC_state->code_index = 0;
6425
6426 #if 0 /* REGC() is (currently) a NOP at the first pass.
6427        * Clever compilers notice this and complain. --jhi */
6428     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6429 #endif
6430     DEBUG_PARSE_r(
6431         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6432         RExC_lastnum=0;
6433         RExC_lastparse=NULL;
6434     );
6435     /* reg may croak on us, not giving us a chance to free
6436        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6437        need it to survive as long as the regexp (qr/(?{})/).
6438        We must check that code_blocksv is not already set, because we may
6439        have jumped back to restart the sizing pass. */
6440     if (pRExC_state->code_blocks && !code_blocksv) {
6441         code_blocksv = newSV_type(SVt_PV);
6442         SAVEFREESV(code_blocksv);
6443         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6444         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6445     }
6446     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6447         /* It's possible to write a regexp in ascii that represents Unicode
6448         codepoints outside of the byte range, such as via \x{100}. If we
6449         detect such a sequence we have to convert the entire pattern to utf8
6450         and then recompile, as our sizing calculation will have been based
6451         on 1 byte == 1 character, but we will need to use utf8 to encode
6452         at least some part of the pattern, and therefore must convert the whole
6453         thing.
6454         -- dmq */
6455         if (flags & RESTART_UTF8) {
6456             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6457                                     pRExC_state->num_code_blocks);
6458             goto redo_first_pass;
6459         }
6460         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6461     }
6462     if (code_blocksv)
6463         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6464
6465     DEBUG_PARSE_r({
6466         PerlIO_printf(Perl_debug_log,
6467             "Required size %"IVdf" nodes\n"
6468             "Starting second pass (creation)\n",
6469             (IV)RExC_size);
6470         RExC_lastnum=0;
6471         RExC_lastparse=NULL;
6472     });
6473
6474     /* The first pass could have found things that force Unicode semantics */
6475     if ((RExC_utf8 || RExC_uni_semantics)
6476          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6477     {
6478         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6479     }
6480
6481     /* Small enough for pointer-storage convention?
6482        If extralen==0, this means that we will not need long jumps. */
6483     if (RExC_size >= 0x10000L && RExC_extralen)
6484         RExC_size += RExC_extralen;
6485     else
6486         RExC_extralen = 0;
6487     if (RExC_whilem_seen > 15)
6488         RExC_whilem_seen = 15;
6489
6490     /* Allocate space and zero-initialize. Note, the two step process
6491        of zeroing when in debug mode, thus anything assigned has to
6492        happen after that */
6493     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6494     r = ReANY(rx);
6495     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6496          char, regexp_internal);
6497     if ( r == NULL || ri == NULL )
6498         FAIL("Regexp out of space");
6499 #ifdef DEBUGGING
6500     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6501     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6502          char);
6503 #else
6504     /* bulk initialize base fields with 0. */
6505     Zero(ri, sizeof(regexp_internal), char);
6506 #endif
6507
6508     /* non-zero initialization begins here */
6509     RXi_SET( r, ri );
6510     r->engine= eng;
6511     r->extflags = rx_flags;
6512     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6513
6514     if (pm_flags & PMf_IS_QR) {
6515         ri->code_blocks = pRExC_state->code_blocks;
6516         ri->num_code_blocks = pRExC_state->num_code_blocks;
6517     }
6518     else
6519     {
6520         int n;
6521         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6522             if (pRExC_state->code_blocks[n].src_regex)
6523                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6524         SAVEFREEPV(pRExC_state->code_blocks);
6525     }
6526
6527     {
6528         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6529         bool has_charset = (get_regex_charset(r->extflags)
6530                                                     != REGEX_DEPENDS_CHARSET);
6531
6532         /* The caret is output if there are any defaults: if not all the STD
6533          * flags are set, or if no character set specifier is needed */
6534         bool has_default =
6535                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6536                     || ! has_charset);
6537         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6538                                                    == REG_RUN_ON_COMMENT_SEEN);
6539         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6540                             >> RXf_PMf_STD_PMMOD_SHIFT);
6541         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6542         char *p;
6543         /* Allocate for the worst case, which is all the std flags are turned
6544          * on.  If more precision is desired, we could do a population count of
6545          * the flags set.  This could be done with a small lookup table, or by
6546          * shifting, masking and adding, or even, when available, assembly
6547          * language for a machine-language population count.
6548          * We never output a minus, as all those are defaults, so are
6549          * covered by the caret */
6550         const STRLEN wraplen = plen + has_p + has_runon
6551             + has_default       /* If needs a caret */
6552
6553                 /* If needs a character set specifier */
6554             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6555             + (sizeof(STD_PAT_MODS) - 1)
6556             + (sizeof("(?:)") - 1);
6557
6558         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6559         r->xpv_len_u.xpvlenu_pv = p;
6560         if (RExC_utf8)
6561             SvFLAGS(rx) |= SVf_UTF8;
6562         *p++='('; *p++='?';
6563
6564         /* If a default, cover it using the caret */
6565         if (has_default) {
6566             *p++= DEFAULT_PAT_MOD;
6567         }
6568         if (has_charset) {
6569             STRLEN len;
6570             const char* const name = get_regex_charset_name(r->extflags, &len);
6571             Copy(name, p, len, char);
6572             p += len;
6573         }
6574         if (has_p)
6575             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6576         {
6577             char ch;
6578             while((ch = *fptr++)) {
6579                 if(reganch & 1)
6580                     *p++ = ch;
6581                 reganch >>= 1;
6582             }
6583         }
6584
6585         *p++ = ':';
6586         Copy(RExC_precomp, p, plen, char);
6587         assert ((RX_WRAPPED(rx) - p) < 16);
6588         r->pre_prefix = p - RX_WRAPPED(rx);
6589         p += plen;
6590         if (has_runon)
6591             *p++ = '\n';
6592         *p++ = ')';
6593         *p = 0;
6594         SvCUR_set(rx, p - RX_WRAPPED(rx));
6595     }
6596
6597     r->intflags = 0;
6598     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6599
6600     /* setup various meta data about recursion, this all requires
6601      * RExC_npar to be correctly set, and a bit later on we clear it */
6602     if (RExC_seen & REG_RECURSE_SEEN) {
6603         Newxz(RExC_open_parens, RExC_npar,regnode *);
6604         SAVEFREEPV(RExC_open_parens);
6605         Newxz(RExC_close_parens,RExC_npar,regnode *);
6606         SAVEFREEPV(RExC_close_parens);
6607     }
6608     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6609         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6610          * So its 1 if there are no parens. */
6611         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6612                                          ((RExC_npar & 0x07) != 0);
6613         Newx(RExC_study_chunk_recursed,
6614              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6615         SAVEFREEPV(RExC_study_chunk_recursed);
6616     }
6617
6618     /* Useful during FAIL. */
6619 #ifdef RE_TRACK_PATTERN_OFFSETS
6620     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6621     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6622                           "%s %"UVuf" bytes for offset annotations.\n",
6623                           ri->u.offsets ? "Got" : "Couldn't get",
6624                           (UV)((2*RExC_size+1) * sizeof(U32))));
6625 #endif
6626     SetProgLen(ri,RExC_size);
6627     RExC_rx_sv = rx;
6628     RExC_rx = r;
6629     RExC_rxi = ri;
6630
6631     /* Second pass: emit code. */
6632     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6633     RExC_pm_flags = pm_flags;
6634     RExC_parse = exp;
6635     RExC_end = exp + plen;
6636     RExC_naughty = 0;
6637     RExC_npar = 1;
6638     RExC_emit_start = ri->program;
6639     RExC_emit = ri->program;
6640     RExC_emit_bound = ri->program + RExC_size + 1;
6641     pRExC_state->code_index = 0;
6642
6643     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6644     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6645         ReREFCNT_dec(rx);
6646         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6647     }
6648     /* XXXX To minimize changes to RE engine we always allocate
6649        3-units-long substrs field. */
6650     Newx(r->substrs, 1, struct reg_substr_data);
6651     if (RExC_recurse_count) {
6652         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6653         SAVEFREEPV(RExC_recurse);
6654     }
6655
6656 reStudy:
6657     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6658     Zero(r->substrs, 1, struct reg_substr_data);
6659     if (RExC_study_chunk_recursed)
6660         Zero(RExC_study_chunk_recursed,
6661              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6662
6663 #ifdef TRIE_STUDY_OPT
6664     if (!restudied) {
6665         StructCopy(&zero_scan_data, &data, scan_data_t);
6666         copyRExC_state = RExC_state;
6667     } else {
6668         U32 seen=RExC_seen;
6669         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6670
6671         RExC_state = copyRExC_state;
6672         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6673             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6674         else
6675             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6676         StructCopy(&zero_scan_data, &data, scan_data_t);
6677     }
6678 #else
6679     StructCopy(&zero_scan_data, &data, scan_data_t);
6680 #endif
6681
6682     /* Dig out information for optimizations. */
6683     r->extflags = RExC_flags; /* was pm_op */
6684     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6685
6686     if (UTF)
6687         SvUTF8_on(rx);  /* Unicode in it? */
6688     ri->regstclass = NULL;
6689     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6690         r->intflags |= PREGf_NAUGHTY;
6691     scan = ri->program + 1;             /* First BRANCH. */
6692
6693     /* testing for BRANCH here tells us whether there is "must appear"
6694        data in the pattern. If there is then we can use it for optimisations */
6695     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6696                                                   */
6697         SSize_t fake;
6698         STRLEN longest_float_length, longest_fixed_length;
6699         regnode_ssc ch_class; /* pointed to by data */
6700         int stclass_flag;
6701         SSize_t last_close = 0; /* pointed to by data */
6702         regnode *first= scan;
6703         regnode *first_next= regnext(first);
6704         /*
6705          * Skip introductions and multiplicators >= 1
6706          * so that we can extract the 'meat' of the pattern that must
6707          * match in the large if() sequence following.
6708          * NOTE that EXACT is NOT covered here, as it is normally
6709          * picked up by the optimiser separately.
6710          *
6711          * This is unfortunate as the optimiser isnt handling lookahead
6712          * properly currently.
6713          *
6714          */
6715         while ((OP(first) == OPEN && (sawopen = 1)) ||
6716                /* An OR of *one* alternative - should not happen now. */
6717             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6718             /* for now we can't handle lookbehind IFMATCH*/
6719             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6720             (OP(first) == PLUS) ||
6721             (OP(first) == MINMOD) ||
6722                /* An {n,m} with n>0 */
6723             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6724             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6725         {
6726                 /*
6727                  * the only op that could be a regnode is PLUS, all the rest
6728                  * will be regnode_1 or regnode_2.
6729                  *
6730                  * (yves doesn't think this is true)
6731                  */
6732                 if (OP(first) == PLUS)
6733                     sawplus = 1;
6734                 else {
6735                     if (OP(first) == MINMOD)
6736                         sawminmod = 1;
6737                     first += regarglen[OP(first)];
6738                 }
6739                 first = NEXTOPER(first);
6740                 first_next= regnext(first);
6741         }
6742
6743         /* Starting-point info. */
6744       again:
6745         DEBUG_PEEP("first:",first,0);
6746         /* Ignore EXACT as we deal with it later. */
6747         if (PL_regkind[OP(first)] == EXACT) {
6748             if (OP(first) == EXACT)
6749                 NOOP;   /* Empty, get anchored substr later. */
6750             else
6751                 ri->regstclass = first;
6752         }
6753 #ifdef TRIE_STCLASS
6754         else if (PL_regkind[OP(first)] == TRIE &&
6755                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6756         {
6757             regnode *trie_op;
6758             /* this can happen only on restudy */
6759             if ( OP(first) == TRIE ) {
6760                 struct regnode_1 *trieop = (struct regnode_1 *)
6761                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6762                 StructCopy(first,trieop,struct regnode_1);
6763                 trie_op=(regnode *)trieop;
6764             } else {
6765                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6766                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6767                 StructCopy(first,trieop,struct regnode_charclass);
6768                 trie_op=(regnode *)trieop;
6769             }
6770             OP(trie_op)+=2;
6771             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6772             ri->regstclass = trie_op;
6773         }
6774 #endif
6775         else if (REGNODE_SIMPLE(OP(first)))
6776             ri->regstclass = first;
6777         else if (PL_regkind[OP(first)] == BOUND ||
6778                  PL_regkind[OP(first)] == NBOUND)
6779             ri->regstclass = first;
6780         else if (PL_regkind[OP(first)] == BOL) {
6781             r->intflags |= (OP(first) == MBOL
6782                            ? PREGf_ANCH_MBOL
6783                            : (OP(first) == SBOL
6784                               ? PREGf_ANCH_SBOL
6785                               : PREGf_ANCH_BOL));
6786             first = NEXTOPER(first);
6787             goto again;
6788         }
6789         else if (OP(first) == GPOS) {
6790             r->intflags |= PREGf_ANCH_GPOS;
6791             first = NEXTOPER(first);
6792             goto again;
6793         }
6794         else if ((!sawopen || !RExC_sawback) &&
6795             (OP(first) == STAR &&
6796             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6797             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6798         {
6799             /* turn .* into ^.* with an implied $*=1 */
6800             const int type =
6801                 (OP(NEXTOPER(first)) == REG_ANY)
6802                     ? PREGf_ANCH_MBOL
6803                     : PREGf_ANCH_SBOL;
6804             r->intflags |= (type | PREGf_IMPLICIT);
6805             first = NEXTOPER(first);
6806             goto again;
6807         }
6808         if (sawplus && !sawminmod && !sawlookahead
6809             && (!sawopen || !RExC_sawback)
6810             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6811             /* x+ must match at the 1st pos of run of x's */
6812             r->intflags |= PREGf_SKIP;
6813
6814         /* Scan is after the zeroth branch, first is atomic matcher. */
6815 #ifdef TRIE_STUDY_OPT
6816         DEBUG_PARSE_r(
6817             if (!restudied)
6818                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6819                               (IV)(first - scan + 1))
6820         );
6821 #else
6822         DEBUG_PARSE_r(
6823             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6824                 (IV)(first - scan + 1))
6825         );
6826 #endif
6827
6828
6829         /*
6830         * If there's something expensive in the r.e., find the
6831         * longest literal string that must appear and make it the
6832         * regmust.  Resolve ties in favor of later strings, since
6833         * the regstart check works with the beginning of the r.e.
6834         * and avoiding duplication strengthens checking.  Not a
6835         * strong reason, but sufficient in the absence of others.
6836         * [Now we resolve ties in favor of the earlier string if
6837         * it happens that c_offset_min has been invalidated, since the
6838         * earlier string may buy us something the later one won't.]
6839         */
6840
6841         data.longest_fixed = newSVpvs("");
6842         data.longest_float = newSVpvs("");
6843         data.last_found = newSVpvs("");
6844         data.longest = &(data.longest_fixed);
6845         ENTER_with_name("study_chunk");
6846         SAVEFREESV(data.longest_fixed);
6847         SAVEFREESV(data.longest_float);
6848         SAVEFREESV(data.last_found);
6849         first = scan;
6850         if (!ri->regstclass) {
6851             ssc_init(pRExC_state, &ch_class);
6852             data.start_class = &ch_class;
6853             stclass_flag = SCF_DO_STCLASS_AND;
6854         } else                          /* XXXX Check for BOUND? */
6855             stclass_flag = 0;
6856         data.last_closep = &last_close;
6857
6858         DEBUG_RExC_seen();
6859         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6860                              scan + RExC_size, /* Up to end */
6861             &data, -1, 0, NULL,
6862             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6863                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6864             0);
6865
6866
6867         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6868
6869
6870         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6871              && data.last_start_min == 0 && data.last_end > 0
6872              && !RExC_seen_zerolen
6873              && !(RExC_seen & REG_VERBARG_SEEN)
6874              && !(RExC_seen & REG_GPOS_SEEN)
6875         ){
6876             r->extflags |= RXf_CHECK_ALL;
6877         }
6878         scan_commit(pRExC_state, &data,&minlen,0);
6879
6880         longest_float_length = CHR_SVLEN(data.longest_float);
6881
6882         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6883                    && data.offset_fixed == data.offset_float_min
6884                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6885             && S_setup_longest (aTHX_ pRExC_state,
6886                                     data.longest_float,
6887                                     &(r->float_utf8),
6888                                     &(r->float_substr),
6889                                     &(r->float_end_shift),
6890                                     data.lookbehind_float,
6891                                     data.offset_float_min,
6892                                     data.minlen_float,
6893                                     longest_float_length,
6894                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6895                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6896         {
6897             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6898             r->float_max_offset = data.offset_float_max;
6899             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6900                 r->float_max_offset -= data.lookbehind_float;
6901             SvREFCNT_inc_simple_void_NN(data.longest_float);
6902         }
6903         else {
6904             r->float_substr = r->float_utf8 = NULL;
6905             longest_float_length = 0;
6906         }
6907
6908         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6909
6910         if (S_setup_longest (aTHX_ pRExC_state,
6911                                 data.longest_fixed,
6912                                 &(r->anchored_utf8),
6913                                 &(r->anchored_substr),
6914                                 &(r->anchored_end_shift),
6915                                 data.lookbehind_fixed,
6916                                 data.offset_fixed,
6917                                 data.minlen_fixed,
6918                                 longest_fixed_length,
6919                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6920                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6921         {
6922             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6923             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6924         }
6925         else {
6926             r->anchored_substr = r->anchored_utf8 = NULL;
6927             longest_fixed_length = 0;
6928         }
6929         LEAVE_with_name("study_chunk");
6930
6931         if (ri->regstclass
6932             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6933             ri->regstclass = NULL;
6934
6935         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6936             && stclass_flag
6937             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6938             && !ssc_is_anything(data.start_class))
6939         {
6940             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6941
6942             ssc_finalize(pRExC_state, data.start_class);
6943
6944             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6945             StructCopy(data.start_class,
6946                        (regnode_ssc*)RExC_rxi->data->data[n],
6947                        regnode_ssc);
6948             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6949             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6950             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6951                       regprop(r, sv, (regnode*)data.start_class, NULL);
6952                       PerlIO_printf(Perl_debug_log,
6953                                     "synthetic stclass \"%s\".\n",
6954                                     SvPVX_const(sv));});
6955             data.start_class = NULL;
6956         }
6957
6958         /* A temporary algorithm prefers floated substr to fixed one to dig
6959          * more info. */
6960         if (longest_fixed_length > longest_float_length) {
6961             r->substrs->check_ix = 0;
6962             r->check_end_shift = r->anchored_end_shift;
6963             r->check_substr = r->anchored_substr;
6964             r->check_utf8 = r->anchored_utf8;
6965             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6966             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6967                 r->intflags |= PREGf_NOSCAN;
6968         }
6969         else {
6970             r->substrs->check_ix = 1;
6971             r->check_end_shift = r->float_end_shift;
6972             r->check_substr = r->float_substr;
6973             r->check_utf8 = r->float_utf8;
6974             r->check_offset_min = r->float_min_offset;
6975             r->check_offset_max = r->float_max_offset;
6976         }
6977         if ((r->check_substr || r->check_utf8) ) {
6978             r->extflags |= RXf_USE_INTUIT;
6979             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6980                 r->extflags |= RXf_INTUIT_TAIL;
6981         }
6982         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6983
6984         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6985         if ( (STRLEN)minlen < longest_float_length )
6986             minlen= longest_float_length;
6987         if ( (STRLEN)minlen < longest_fixed_length )
6988             minlen= longest_fixed_length;
6989         */
6990     }
6991     else {
6992         /* Several toplevels. Best we can is to set minlen. */
6993         SSize_t fake;
6994         regnode_ssc ch_class;
6995         SSize_t last_close = 0;
6996
6997         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6998
6999         scan = ri->program + 1;
7000         ssc_init(pRExC_state, &ch_class);
7001         data.start_class = &ch_class;
7002         data.last_closep = &last_close;
7003
7004         DEBUG_RExC_seen();
7005         minlen = study_chunk(pRExC_state,
7006             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7007             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7008                                                       ? SCF_TRIE_DOING_RESTUDY
7009                                                       : 0),
7010             0);
7011
7012         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7013
7014         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7015                 = r->float_substr = r->float_utf8 = NULL;
7016
7017         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7018             && ! ssc_is_anything(data.start_class))
7019         {
7020             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7021
7022             ssc_finalize(pRExC_state, data.start_class);
7023
7024             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7025             StructCopy(data.start_class,
7026                        (regnode_ssc*)RExC_rxi->data->data[n],
7027                        regnode_ssc);
7028             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7029             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7030             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7031                       regprop(r, sv, (regnode*)data.start_class, NULL);
7032                       PerlIO_printf(Perl_debug_log,
7033                                     "synthetic stclass \"%s\".\n",
7034                                     SvPVX_const(sv));});
7035             data.start_class = NULL;
7036         }
7037     }
7038
7039     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7040         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7041         r->maxlen = REG_INFTY;
7042     }
7043     else {
7044         r->maxlen = RExC_maxlen;
7045     }
7046
7047     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7048        the "real" pattern. */
7049     DEBUG_OPTIMISE_r({
7050         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7051                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7052     });
7053     r->minlenret = minlen;
7054     if (r->minlen < minlen)
7055         r->minlen = minlen;
7056
7057     if (RExC_seen & REG_GPOS_SEEN)
7058         r->intflags |= PREGf_GPOS_SEEN;
7059     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7060         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7061                                                 lookbehind */
7062     if (pRExC_state->num_code_blocks)
7063         r->extflags |= RXf_EVAL_SEEN;
7064     if (RExC_seen & REG_CANY_SEEN)
7065         r->intflags |= PREGf_CANY_SEEN;
7066     if (RExC_seen & REG_VERBARG_SEEN)
7067     {
7068         r->intflags |= PREGf_VERBARG_SEEN;
7069         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7070     }
7071     if (RExC_seen & REG_CUTGROUP_SEEN)
7072         r->intflags |= PREGf_CUTGROUP_SEEN;
7073     if (pm_flags & PMf_USE_RE_EVAL)
7074         r->intflags |= PREGf_USE_RE_EVAL;
7075     if (RExC_paren_names)
7076         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7077     else
7078         RXp_PAREN_NAMES(r) = NULL;
7079
7080     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7081      * so it can be used in pp.c */
7082     if (r->intflags & PREGf_ANCH)
7083         r->extflags |= RXf_IS_ANCHORED;
7084
7085
7086     {
7087         /* this is used to identify "special" patterns that might result
7088          * in Perl NOT calling the regex engine and instead doing the match "itself",
7089          * particularly special cases in split//. By having the regex compiler
7090          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7091          * we avoid weird issues with equivalent patterns resulting in different behavior,
7092          * AND we allow non Perl engines to get the same optimizations by the setting the
7093          * flags appropriately - Yves */
7094         regnode *first = ri->program + 1;
7095         U8 fop = OP(first);
7096         regnode *next = NEXTOPER(first);
7097         U8 nop = OP(next);
7098
7099         if (PL_regkind[fop] == NOTHING && nop == END)
7100             r->extflags |= RXf_NULL;
7101         else if (PL_regkind[fop] == BOL && nop == END)
7102             r->extflags |= RXf_START_ONLY;
7103         else if (fop == PLUS
7104                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7105                  && OP(regnext(first)) == END)
7106             r->extflags |= RXf_WHITE;
7107         else if ( r->extflags & RXf_SPLIT
7108                   && fop == EXACT
7109                   && STR_LEN(first) == 1
7110                   && *(STRING(first)) == ' '
7111                   && OP(regnext(first)) == END )
7112             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7113
7114     }
7115
7116     if (RExC_contains_locale) {
7117         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7118     }
7119
7120 #ifdef DEBUGGING
7121     if (RExC_paren_names) {
7122         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7123         ri->data->data[ri->name_list_idx]
7124                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7125     } else
7126 #endif
7127         ri->name_list_idx = 0;
7128
7129     if (RExC_recurse_count) {
7130         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7131             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7132             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7133         }
7134     }
7135     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7136     /* assume we don't need to swap parens around before we match */
7137
7138     DEBUG_DUMP_r({
7139         DEBUG_RExC_seen();
7140         PerlIO_printf(Perl_debug_log,"Final program:\n");
7141         regdump(r);
7142     });
7143 #ifdef RE_TRACK_PATTERN_OFFSETS
7144     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7145         const STRLEN len = ri->u.offsets[0];
7146         STRLEN i;
7147         GET_RE_DEBUG_FLAGS_DECL;
7148         PerlIO_printf(Perl_debug_log,
7149                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7150         for (i = 1; i <= len; i++) {
7151             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7152                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7153                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7154             }
7155         PerlIO_printf(Perl_debug_log, "\n");
7156     });
7157 #endif
7158
7159 #ifdef USE_ITHREADS
7160     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7161      * by setting the regexp SV to readonly-only instead. If the
7162      * pattern's been recompiled, the USEDness should remain. */
7163     if (old_re && SvREADONLY(old_re))
7164         SvREADONLY_on(rx);
7165 #endif
7166     return rx;
7167 }
7168
7169
7170 SV*
7171 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7172                     const U32 flags)
7173 {
7174     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7175
7176     PERL_UNUSED_ARG(value);
7177
7178     if (flags & RXapif_FETCH) {
7179         return reg_named_buff_fetch(rx, key, flags);
7180     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7181         Perl_croak_no_modify();
7182         return NULL;
7183     } else if (flags & RXapif_EXISTS) {
7184         return reg_named_buff_exists(rx, key, flags)
7185             ? &PL_sv_yes
7186             : &PL_sv_no;
7187     } else if (flags & RXapif_REGNAMES) {
7188         return reg_named_buff_all(rx, flags);
7189     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7190         return reg_named_buff_scalar(rx, flags);
7191     } else {
7192         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7193         return NULL;
7194     }
7195 }
7196
7197 SV*
7198 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7199                          const U32 flags)
7200 {
7201     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7202     PERL_UNUSED_ARG(lastkey);
7203
7204     if (flags & RXapif_FIRSTKEY)
7205         return reg_named_buff_firstkey(rx, flags);
7206     else if (flags & RXapif_NEXTKEY)
7207         return reg_named_buff_nextkey(rx, flags);
7208     else {
7209         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7210                                             (int)flags);
7211         return NULL;
7212     }
7213 }
7214
7215 SV*
7216 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7217                           const U32 flags)
7218 {
7219     AV *retarray = NULL;
7220     SV *ret;
7221     struct regexp *const rx = ReANY(r);
7222
7223     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7224
7225     if (flags & RXapif_ALL)
7226         retarray=newAV();
7227
7228     if (rx && RXp_PAREN_NAMES(rx)) {
7229         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7230         if (he_str) {
7231             IV i;
7232             SV* sv_dat=HeVAL(he_str);
7233             I32 *nums=(I32*)SvPVX(sv_dat);
7234             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7235                 if ((I32)(rx->nparens) >= nums[i]
7236                     && rx->offs[nums[i]].start != -1
7237                     && rx->offs[nums[i]].end != -1)
7238                 {
7239                     ret = newSVpvs("");
7240                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7241                     if (!retarray)
7242                         return ret;
7243                 } else {
7244                     if (retarray)
7245                         ret = newSVsv(&PL_sv_undef);
7246                 }
7247                 if (retarray)
7248                     av_push(retarray, ret);
7249             }
7250             if (retarray)
7251                 return newRV_noinc(MUTABLE_SV(retarray));
7252         }
7253     }
7254     return NULL;
7255 }
7256
7257 bool
7258 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7259                            const U32 flags)
7260 {
7261     struct regexp *const rx = ReANY(r);
7262
7263     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7264
7265     if (rx && RXp_PAREN_NAMES(rx)) {
7266         if (flags & RXapif_ALL) {
7267             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7268         } else {
7269             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7270             if (sv) {
7271                 SvREFCNT_dec_NN(sv);
7272                 return TRUE;
7273             } else {
7274                 return FALSE;
7275             }
7276         }
7277     } else {
7278         return FALSE;
7279     }
7280 }
7281
7282 SV*
7283 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7284 {
7285     struct regexp *const rx = ReANY(r);
7286
7287     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7288
7289     if ( rx && RXp_PAREN_NAMES(rx) ) {
7290         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7291
7292         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7293     } else {
7294         return FALSE;
7295     }
7296 }
7297
7298 SV*
7299 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7300 {
7301     struct regexp *const rx = ReANY(r);
7302     GET_RE_DEBUG_FLAGS_DECL;
7303
7304     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7305
7306     if (rx && RXp_PAREN_NAMES(rx)) {
7307         HV *hv = RXp_PAREN_NAMES(rx);
7308         HE *temphe;
7309         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7310             IV i;
7311             IV parno = 0;
7312             SV* sv_dat = HeVAL(temphe);
7313             I32 *nums = (I32*)SvPVX(sv_dat);
7314             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7315                 if ((I32)(rx->lastparen) >= nums[i] &&
7316                     rx->offs[nums[i]].start != -1 &&
7317                     rx->offs[nums[i]].end != -1)
7318                 {
7319                     parno = nums[i];
7320                     break;
7321                 }
7322             }
7323             if (parno || flags & RXapif_ALL) {
7324                 return newSVhek(HeKEY_hek(temphe));
7325             }
7326         }
7327     }
7328     return NULL;
7329 }
7330
7331 SV*
7332 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7333 {
7334     SV *ret;
7335     AV *av;
7336     SSize_t length;
7337     struct regexp *const rx = ReANY(r);
7338
7339     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7340
7341     if (rx && RXp_PAREN_NAMES(rx)) {
7342         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7343             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7344         } else if (flags & RXapif_ONE) {
7345             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7346             av = MUTABLE_AV(SvRV(ret));
7347             length = av_tindex(av);
7348             SvREFCNT_dec_NN(ret);
7349             return newSViv(length + 1);
7350         } else {
7351             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7352                                                 (int)flags);
7353             return NULL;
7354         }
7355     }
7356     return &PL_sv_undef;
7357 }
7358
7359 SV*
7360 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7361 {
7362     struct regexp *const rx = ReANY(r);
7363     AV *av = newAV();
7364
7365     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7366
7367     if (rx && RXp_PAREN_NAMES(rx)) {
7368         HV *hv= RXp_PAREN_NAMES(rx);
7369         HE *temphe;
7370         (void)hv_iterinit(hv);
7371         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7372             IV i;
7373             IV parno = 0;
7374             SV* sv_dat = HeVAL(temphe);
7375             I32 *nums = (I32*)SvPVX(sv_dat);
7376             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7377                 if ((I32)(rx->lastparen) >= nums[i] &&
7378                     rx->offs[nums[i]].start != -1 &&
7379                     rx->offs[nums[i]].end != -1)
7380                 {
7381                     parno = nums[i];
7382                     break;
7383                 }
7384             }
7385             if (parno || flags & RXapif_ALL) {
7386                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7387             }
7388         }
7389     }
7390
7391     return newRV_noinc(MUTABLE_SV(av));
7392 }
7393
7394 void
7395 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7396                              SV * const sv)
7397 {
7398     struct regexp *const rx = ReANY(r);
7399     char *s = NULL;
7400     SSize_t i = 0;
7401     SSize_t s1, t1;
7402     I32 n = paren;
7403
7404     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7405
7406     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7407            || n == RX_BUFF_IDX_CARET_FULLMATCH
7408            || n == RX_BUFF_IDX_CARET_POSTMATCH
7409        )
7410     {
7411         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7412         if (!keepcopy) {
7413             /* on something like
7414              *    $r = qr/.../;
7415              *    /$qr/p;
7416              * the KEEPCOPY is set on the PMOP rather than the regex */
7417             if (PL_curpm && r == PM_GETRE(PL_curpm))
7418                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7419         }
7420         if (!keepcopy)
7421             goto ret_undef;
7422     }
7423
7424     if (!rx->subbeg)
7425         goto ret_undef;
7426
7427     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7428         /* no need to distinguish between them any more */
7429         n = RX_BUFF_IDX_FULLMATCH;
7430
7431     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7432         && rx->offs[0].start != -1)
7433     {
7434         /* $`, ${^PREMATCH} */
7435         i = rx->offs[0].start;
7436         s = rx->subbeg;
7437     }
7438     else
7439     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7440         && rx->offs[0].end != -1)
7441     {
7442         /* $', ${^POSTMATCH} */
7443         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7444         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7445     }
7446     else
7447     if ( 0 <= n && n <= (I32)rx->nparens &&
7448         (s1 = rx->offs[n].start) != -1 &&
7449         (t1 = rx->offs[n].end) != -1)
7450     {
7451         /* $&, ${^MATCH},  $1 ... */
7452         i = t1 - s1;
7453         s = rx->subbeg + s1 - rx->suboffset;
7454     } else {
7455         goto ret_undef;
7456     }
7457
7458     assert(s >= rx->subbeg);
7459     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7460     if (i >= 0) {
7461 #ifdef NO_TAINT_SUPPORT
7462         sv_setpvn(sv, s, i);
7463 #else
7464         const int oldtainted = TAINT_get;
7465         TAINT_NOT;
7466         sv_setpvn(sv, s, i);
7467         TAINT_set(oldtainted);
7468 #endif
7469         if ( (rx->intflags & PREGf_CANY_SEEN)
7470             ? (RXp_MATCH_UTF8(rx)
7471                         && (!i || is_utf8_string((U8*)s, i)))
7472             : (RXp_MATCH_UTF8(rx)) )
7473         {
7474             SvUTF8_on(sv);
7475         }
7476         else
7477             SvUTF8_off(sv);
7478         if (TAINTING_get) {
7479             if (RXp_MATCH_TAINTED(rx)) {
7480                 if (SvTYPE(sv) >= SVt_PVMG) {
7481                     MAGIC* const mg = SvMAGIC(sv);
7482                     MAGIC* mgt;
7483                     TAINT;
7484                     SvMAGIC_set(sv, mg->mg_moremagic);
7485                     SvTAINT(sv);
7486                     if ((mgt = SvMAGIC(sv))) {
7487                         mg->mg_moremagic = mgt;
7488                         SvMAGIC_set(sv, mg);
7489                     }
7490                 } else {
7491                     TAINT;
7492                     SvTAINT(sv);
7493                 }
7494             } else
7495                 SvTAINTED_off(sv);
7496         }
7497     } else {
7498       ret_undef:
7499         sv_setsv(sv,&PL_sv_undef);
7500         return;
7501     }
7502 }
7503
7504 void
7505 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7506                                                          SV const * const value)
7507 {
7508     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7509
7510     PERL_UNUSED_ARG(rx);
7511     PERL_UNUSED_ARG(paren);
7512     PERL_UNUSED_ARG(value);
7513
7514     if (!PL_localizing)
7515         Perl_croak_no_modify();
7516 }
7517
7518 I32
7519 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7520                               const I32 paren)
7521 {
7522     struct regexp *const rx = ReANY(r);
7523     I32 i;
7524     I32 s1, t1;
7525
7526     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7527
7528     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7529         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7530         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7531     )
7532     {
7533         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7534         if (!keepcopy) {
7535             /* on something like
7536              *    $r = qr/.../;
7537              *    /$qr/p;
7538              * the KEEPCOPY is set on the PMOP rather than the regex */
7539             if (PL_curpm && r == PM_GETRE(PL_curpm))
7540                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7541         }
7542         if (!keepcopy)
7543             goto warn_undef;
7544     }
7545
7546     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7547     switch (paren) {
7548       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7549       case RX_BUFF_IDX_PREMATCH:       /* $` */
7550         if (rx->offs[0].start != -1) {
7551                         i = rx->offs[0].start;
7552                         if (i > 0) {
7553                                 s1 = 0;
7554                                 t1 = i;
7555                                 goto getlen;
7556                         }
7557             }
7558         return 0;
7559
7560       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7561       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7562             if (rx->offs[0].end != -1) {
7563                         i = rx->sublen - rx->offs[0].end;
7564                         if (i > 0) {
7565                                 s1 = rx->offs[0].end;
7566                                 t1 = rx->sublen;
7567                                 goto getlen;
7568                         }
7569             }
7570         return 0;
7571
7572       default: /* $& / ${^MATCH}, $1, $2, ... */
7573             if (paren <= (I32)rx->nparens &&
7574             (s1 = rx->offs[paren].start) != -1 &&
7575             (t1 = rx->offs[paren].end) != -1)
7576             {
7577             i = t1 - s1;
7578             goto getlen;
7579         } else {
7580           warn_undef:
7581             if (ckWARN(WARN_UNINITIALIZED))
7582                 report_uninit((const SV *)sv);
7583             return 0;
7584         }
7585     }
7586   getlen:
7587     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7588         const char * const s = rx->subbeg - rx->suboffset + s1;
7589         const U8 *ep;
7590         STRLEN el;
7591
7592         i = t1 - s1;
7593         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7594                         i = el;
7595     }
7596     return i;
7597 }
7598
7599 SV*
7600 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7601 {
7602     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7603         PERL_UNUSED_ARG(rx);
7604         if (0)
7605             return NULL;
7606         else
7607             return newSVpvs("Regexp");
7608 }
7609
7610 /* Scans the name of a named buffer from the pattern.
7611  * If flags is REG_RSN_RETURN_NULL returns null.
7612  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7613  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7614  * to the parsed name as looked up in the RExC_paren_names hash.
7615  * If there is an error throws a vFAIL().. type exception.
7616  */
7617
7618 #define REG_RSN_RETURN_NULL    0
7619 #define REG_RSN_RETURN_NAME    1
7620 #define REG_RSN_RETURN_DATA    2
7621
7622 STATIC SV*
7623 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7624 {
7625     char *name_start = RExC_parse;
7626
7627     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7628
7629     assert (RExC_parse <= RExC_end);
7630     if (RExC_parse == RExC_end) NOOP;
7631     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7632          /* skip IDFIRST by using do...while */
7633         if (UTF)
7634             do {
7635                 RExC_parse += UTF8SKIP(RExC_parse);
7636             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7637         else
7638             do {
7639                 RExC_parse++;
7640             } while (isWORDCHAR(*RExC_parse));
7641     } else {
7642         RExC_parse++; /* so the <- from the vFAIL is after the offending
7643                          character */
7644         vFAIL("Group name must start with a non-digit word character");
7645     }
7646     if ( flags ) {
7647         SV* sv_name
7648             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7649                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7650         if ( flags == REG_RSN_RETURN_NAME)
7651             return sv_name;
7652         else if (flags==REG_RSN_RETURN_DATA) {
7653             HE *he_str = NULL;
7654             SV *sv_dat = NULL;
7655             if ( ! sv_name )      /* should not happen*/
7656                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7657             if (RExC_paren_names)
7658                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7659             if ( he_str )
7660                 sv_dat = HeVAL(he_str);
7661             if ( ! sv_dat )
7662                 vFAIL("Reference to nonexistent named group");
7663             return sv_dat;
7664         }
7665         else {
7666             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7667                        (unsigned long) flags);
7668         }
7669         assert(0); /* NOT REACHED */
7670     }
7671     return NULL;
7672 }
7673
7674 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7675     int rem=(int)(RExC_end - RExC_parse);                       \
7676     int cut;                                                    \
7677     int num;                                                    \
7678     int iscut=0;                                                \
7679     if (rem>10) {                                               \
7680         rem=10;                                                 \
7681         iscut=1;                                                \
7682     }                                                           \
7683     cut=10-rem;                                                 \
7684     if (RExC_lastparse!=RExC_parse)                             \
7685         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7686             rem, RExC_parse,                                    \
7687             cut + 4,                                            \
7688             iscut ? "..." : "<"                                 \
7689         );                                                      \
7690     else                                                        \
7691         PerlIO_printf(Perl_debug_log,"%16s","");                \
7692                                                                 \
7693     if (SIZE_ONLY)                                              \
7694        num = RExC_size + 1;                                     \
7695     else                                                        \
7696        num=REG_NODE_NUM(RExC_emit);                             \
7697     if (RExC_lastnum!=num)                                      \
7698        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7699     else                                                        \
7700        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7701     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7702         (int)((depth*2)), "",                                   \
7703         (funcname)                                              \
7704     );                                                          \
7705     RExC_lastnum=num;                                           \
7706     RExC_lastparse=RExC_parse;                                  \
7707 })
7708
7709
7710
7711 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7712     DEBUG_PARSE_MSG((funcname));                            \
7713     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7714 })
7715 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7716     DEBUG_PARSE_MSG((funcname));                            \
7717     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7718 })
7719
7720 /* This section of code defines the inversion list object and its methods.  The
7721  * interfaces are highly subject to change, so as much as possible is static to
7722  * this file.  An inversion list is here implemented as a malloc'd C UV array
7723  * as an SVt_INVLIST scalar.
7724  *
7725  * An inversion list for Unicode is an array of code points, sorted by ordinal
7726  * number.  The zeroth element is the first code point in the list.  The 1th
7727  * element is the first element beyond that not in the list.  In other words,
7728  * the first range is
7729  *  invlist[0]..(invlist[1]-1)
7730  * The other ranges follow.  Thus every element whose index is divisible by two
7731  * marks the beginning of a range that is in the list, and every element not
7732  * divisible by two marks the beginning of a range not in the list.  A single
7733  * element inversion list that contains the single code point N generally
7734  * consists of two elements
7735  *  invlist[0] == N
7736  *  invlist[1] == N+1
7737  * (The exception is when N is the highest representable value on the
7738  * machine, in which case the list containing just it would be a single
7739  * element, itself.  By extension, if the last range in the list extends to
7740  * infinity, then the first element of that range will be in the inversion list
7741  * at a position that is divisible by two, and is the final element in the
7742  * list.)
7743  * Taking the complement (inverting) an inversion list is quite simple, if the
7744  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7745  * This implementation reserves an element at the beginning of each inversion
7746  * list to always contain 0; there is an additional flag in the header which
7747  * indicates if the list begins at the 0, or is offset to begin at the next
7748  * element.
7749  *
7750  * More about inversion lists can be found in "Unicode Demystified"
7751  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7752  * More will be coming when functionality is added later.
7753  *
7754  * The inversion list data structure is currently implemented as an SV pointing
7755  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7756  * array of UV whose memory management is automatically handled by the existing
7757  * facilities for SV's.
7758  *
7759  * Some of the methods should always be private to the implementation, and some
7760  * should eventually be made public */
7761
7762 /* The header definitions are in F<inline_invlist.c> */
7763
7764 PERL_STATIC_INLINE UV*
7765 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7766 {
7767     /* Returns a pointer to the first element in the inversion list's array.
7768      * This is called upon initialization of an inversion list.  Where the
7769      * array begins depends on whether the list has the code point U+0000 in it
7770      * or not.  The other parameter tells it whether the code that follows this
7771      * call is about to put a 0 in the inversion list or not.  The first
7772      * element is either the element reserved for 0, if TRUE, or the element
7773      * after it, if FALSE */
7774
7775     bool* offset = get_invlist_offset_addr(invlist);
7776     UV* zero_addr = (UV *) SvPVX(invlist);
7777
7778     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7779
7780     /* Must be empty */
7781     assert(! _invlist_len(invlist));
7782
7783     *zero_addr = 0;
7784
7785     /* 1^1 = 0; 1^0 = 1 */
7786     *offset = 1 ^ will_have_0;
7787     return zero_addr + *offset;
7788 }
7789
7790 PERL_STATIC_INLINE UV*
7791 S_invlist_array(pTHX_ SV* const invlist)
7792 {
7793     /* Returns the pointer to the inversion list's array.  Every time the
7794      * length changes, this needs to be called in case malloc or realloc moved
7795      * it */
7796
7797     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7798
7799     /* Must not be empty.  If these fail, you probably didn't check for <len>
7800      * being non-zero before trying to get the array */
7801     assert(_invlist_len(invlist));
7802
7803     /* The very first element always contains zero, The array begins either
7804      * there, or if the inversion list is offset, at the element after it.
7805      * The offset header field determines which; it contains 0 or 1 to indicate
7806      * how much additionally to add */
7807     assert(0 == *(SvPVX(invlist)));
7808     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7809 }
7810
7811 PERL_STATIC_INLINE void
7812 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7813 {
7814     /* Sets the current number of elements stored in the inversion list.
7815      * Updates SvCUR correspondingly */
7816
7817     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7818
7819     assert(SvTYPE(invlist) == SVt_INVLIST);
7820
7821     SvCUR_set(invlist,
7822               (len == 0)
7823                ? 0
7824                : TO_INTERNAL_SIZE(len + offset));
7825     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7826 }
7827
7828 PERL_STATIC_INLINE IV*
7829 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7830 {
7831     /* Return the address of the IV that is reserved to hold the cached index
7832      * */
7833
7834     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7835
7836     assert(SvTYPE(invlist) == SVt_INVLIST);
7837
7838     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7839 }
7840
7841 PERL_STATIC_INLINE IV
7842 S_invlist_previous_index(pTHX_ SV* const invlist)
7843 {
7844     /* Returns cached index of previous search */
7845
7846     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7847
7848     return *get_invlist_previous_index_addr(invlist);
7849 }
7850
7851 PERL_STATIC_INLINE void
7852 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7853 {
7854     /* Caches <index> for later retrieval */
7855
7856     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7857
7858     assert(index == 0 || index < (int) _invlist_len(invlist));
7859
7860     *get_invlist_previous_index_addr(invlist) = index;
7861 }
7862
7863 PERL_STATIC_INLINE UV
7864 S_invlist_max(pTHX_ SV* const invlist)
7865 {
7866     /* Returns the maximum number of elements storable in the inversion list's
7867      * array, without having to realloc() */
7868
7869     PERL_ARGS_ASSERT_INVLIST_MAX;
7870
7871     assert(SvTYPE(invlist) == SVt_INVLIST);
7872
7873     /* Assumes worst case, in which the 0 element is not counted in the
7874      * inversion list, so subtracts 1 for that */
7875     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7876            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7877            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7878 }
7879
7880 #ifndef PERL_IN_XSUB_RE
7881 SV*
7882 Perl__new_invlist(pTHX_ IV initial_size)
7883 {
7884
7885     /* Return a pointer to a newly constructed inversion list, with enough
7886      * space to store 'initial_size' elements.  If that number is negative, a
7887      * system default is used instead */
7888
7889     SV* new_list;
7890
7891     if (initial_size < 0) {
7892         initial_size = 10;
7893     }
7894
7895     /* Allocate the initial space */
7896     new_list = newSV_type(SVt_INVLIST);
7897
7898     /* First 1 is in case the zero element isn't in the list; second 1 is for
7899      * trailing NUL */
7900     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7901     invlist_set_len(new_list, 0, 0);
7902
7903     /* Force iterinit() to be used to get iteration to work */
7904     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7905
7906     *get_invlist_previous_index_addr(new_list) = 0;
7907
7908     return new_list;
7909 }
7910
7911 SV*
7912 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7913 {
7914     /* Return a pointer to a newly constructed inversion list, initialized to
7915      * point to <list>, which has to be in the exact correct inversion list
7916      * form, including internal fields.  Thus this is a dangerous routine that
7917      * should not be used in the wrong hands.  The passed in 'list' contains
7918      * several header fields at the beginning that are not part of the
7919      * inversion list body proper */
7920
7921     const STRLEN length = (STRLEN) list[0];
7922     const UV version_id =          list[1];
7923     const bool offset   =    cBOOL(list[2]);
7924 #define HEADER_LENGTH 3
7925     /* If any of the above changes in any way, you must change HEADER_LENGTH
7926      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7927      *      perl -E 'say int(rand 2**31-1)'
7928      */
7929 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7930                                         data structure type, so that one being
7931                                         passed in can be validated to be an
7932                                         inversion list of the correct vintage.
7933                                        */
7934
7935     SV* invlist = newSV_type(SVt_INVLIST);
7936
7937     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7938
7939     if (version_id != INVLIST_VERSION_ID) {
7940         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7941     }
7942
7943     /* The generated array passed in includes header elements that aren't part
7944      * of the list proper, so start it just after them */
7945     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7946
7947     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7948                                shouldn't touch it */
7949
7950     *(get_invlist_offset_addr(invlist)) = offset;
7951
7952     /* The 'length' passed to us is the physical number of elements in the
7953      * inversion list.  But if there is an offset the logical number is one
7954      * less than that */
7955     invlist_set_len(invlist, length  - offset, offset);
7956
7957     invlist_set_previous_index(invlist, 0);
7958
7959     /* Initialize the iteration pointer. */
7960     invlist_iterfinish(invlist);
7961
7962     SvREADONLY_on(invlist);
7963
7964     return invlist;
7965 }
7966 #endif /* ifndef PERL_IN_XSUB_RE */
7967
7968 STATIC void
7969 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7970 {
7971     /* Grow the maximum size of an inversion list */
7972
7973     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7974
7975     assert(SvTYPE(invlist) == SVt_INVLIST);
7976
7977     /* Add one to account for the zero element at the beginning which may not
7978      * be counted by the calling parameters */
7979     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7980 }
7981
7982 PERL_STATIC_INLINE void
7983 S_invlist_trim(pTHX_ SV* const invlist)
7984 {
7985     PERL_ARGS_ASSERT_INVLIST_TRIM;
7986
7987     assert(SvTYPE(invlist) == SVt_INVLIST);
7988
7989     /* Change the length of the inversion list to how many entries it currently
7990      * has */
7991     SvPV_shrink_to_cur((SV *) invlist);
7992 }
7993
7994 STATIC void
7995 S__append_range_to_invlist(pTHX_ SV* const invlist,
7996                                  const UV start, const UV end)
7997 {
7998    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7999     * the end of the inversion list.  The range must be above any existing
8000     * ones. */
8001
8002     UV* array;
8003     UV max = invlist_max(invlist);
8004     UV len = _invlist_len(invlist);
8005     bool offset;
8006
8007     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8008
8009     if (len == 0) { /* Empty lists must be initialized */
8010         offset = start != 0;
8011         array = _invlist_array_init(invlist, ! offset);
8012     }
8013     else {
8014         /* Here, the existing list is non-empty. The current max entry in the
8015          * list is generally the first value not in the set, except when the
8016          * set extends to the end of permissible values, in which case it is
8017          * the first entry in that final set, and so this call is an attempt to
8018          * append out-of-order */
8019
8020         UV final_element = len - 1;
8021         array = invlist_array(invlist);
8022         if (array[final_element] > start
8023             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8024         {
8025             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8026                      array[final_element], start,
8027                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8028         }
8029
8030         /* Here, it is a legal append.  If the new range begins with the first
8031          * value not in the set, it is extending the set, so the new first
8032          * value not in the set is one greater than the newly extended range.
8033          * */
8034         offset = *get_invlist_offset_addr(invlist);
8035         if (array[final_element] == start) {
8036             if (end != UV_MAX) {
8037                 array[final_element] = end + 1;
8038             }
8039             else {
8040                 /* But if the end is the maximum representable on the machine,
8041                  * just let the range that this would extend to have no end */
8042                 invlist_set_len(invlist, len - 1, offset);
8043             }
8044             return;
8045         }
8046     }
8047
8048     /* Here the new range doesn't extend any existing set.  Add it */
8049
8050     len += 2;   /* Includes an element each for the start and end of range */
8051
8052     /* If wll overflow the existing space, extend, which may cause the array to
8053      * be moved */
8054     if (max < len) {
8055         invlist_extend(invlist, len);
8056
8057         /* Have to set len here to avoid assert failure in invlist_array() */
8058         invlist_set_len(invlist, len, offset);
8059
8060         array = invlist_array(invlist);
8061     }
8062     else {
8063         invlist_set_len(invlist, len, offset);
8064     }
8065
8066     /* The next item on the list starts the range, the one after that is
8067      * one past the new range.  */
8068     array[len - 2] = start;
8069     if (end != UV_MAX) {
8070         array[len - 1] = end + 1;
8071     }
8072     else {
8073         /* But if the end is the maximum representable on the machine, just let
8074          * the range have no end */
8075         invlist_set_len(invlist, len - 1, offset);
8076     }
8077 }
8078
8079 #ifndef PERL_IN_XSUB_RE
8080
8081 IV
8082 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8083 {
8084     /* Searches the inversion list for the entry that contains the input code
8085      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8086      * return value is the index into the list's array of the range that
8087      * contains <cp> */
8088
8089     IV low = 0;
8090     IV mid;
8091     IV high = _invlist_len(invlist);
8092     const IV highest_element = high - 1;
8093     const UV* array;
8094
8095     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8096
8097     /* If list is empty, return failure. */
8098     if (high == 0) {
8099         return -1;
8100     }
8101
8102     /* (We can't get the array unless we know the list is non-empty) */
8103     array = invlist_array(invlist);
8104
8105     mid = invlist_previous_index(invlist);
8106     assert(mid >=0 && mid <= highest_element);
8107
8108     /* <mid> contains the cache of the result of the previous call to this
8109      * function (0 the first time).  See if this call is for the same result,
8110      * or if it is for mid-1.  This is under the theory that calls to this
8111      * function will often be for related code points that are near each other.
8112      * And benchmarks show that caching gives better results.  We also test
8113      * here if the code point is within the bounds of the list.  These tests
8114      * replace others that would have had to be made anyway to make sure that
8115      * the array bounds were not exceeded, and these give us extra information
8116      * at the same time */
8117     if (cp >= array[mid]) {
8118         if (cp >= array[highest_element]) {
8119             return highest_element;
8120         }
8121
8122         /* Here, array[mid] <= cp < array[highest_element].  This means that
8123          * the final element is not the answer, so can exclude it; it also
8124          * means that <mid> is not the final element, so can refer to 'mid + 1'
8125          * safely */
8126         if (cp < array[mid + 1]) {
8127             return mid;
8128         }
8129         high--;
8130         low = mid + 1;
8131     }
8132     else { /* cp < aray[mid] */
8133         if (cp < array[0]) { /* Fail if outside the array */
8134             return -1;
8135         }
8136         high = mid;
8137         if (cp >= array[mid - 1]) {
8138             goto found_entry;
8139         }
8140     }
8141
8142     /* Binary search.  What we are looking for is <i> such that
8143      *  array[i] <= cp < array[i+1]
8144      * The loop below converges on the i+1.  Note that there may not be an
8145      * (i+1)th element in the array, and things work nonetheless */
8146     while (low < high) {
8147         mid = (low + high) / 2;
8148         assert(mid <= highest_element);
8149         if (array[mid] <= cp) { /* cp >= array[mid] */
8150             low = mid + 1;
8151
8152             /* We could do this extra test to exit the loop early.
8153             if (cp < array[low]) {
8154                 return mid;
8155             }
8156             */
8157         }
8158         else { /* cp < array[mid] */
8159             high = mid;
8160         }
8161     }
8162
8163   found_entry:
8164     high--;
8165     invlist_set_previous_index(invlist, high);
8166     return high;
8167 }
8168
8169 void
8170 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8171                                     const UV start, const UV end, U8* swatch)
8172 {
8173     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8174      * but is used when the swash has an inversion list.  This makes this much
8175      * faster, as it uses a binary search instead of a linear one.  This is
8176      * intimately tied to that function, and perhaps should be in utf8.c,
8177      * except it is intimately tied to inversion lists as well.  It assumes
8178      * that <swatch> is all 0's on input */
8179
8180     UV current = start;
8181     const IV len = _invlist_len(invlist);
8182     IV i;
8183     const UV * array;
8184
8185     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8186
8187     if (len == 0) { /* Empty inversion list */
8188         return;
8189     }
8190
8191     array = invlist_array(invlist);
8192
8193     /* Find which element it is */
8194     i = _invlist_search(invlist, start);
8195
8196     /* We populate from <start> to <end> */
8197     while (current < end) {
8198         UV upper;
8199
8200         /* The inversion list gives the results for every possible code point
8201          * after the first one in the list.  Only those ranges whose index is
8202          * even are ones that the inversion list matches.  For the odd ones,
8203          * and if the initial code point is not in the list, we have to skip
8204          * forward to the next element */
8205         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8206             i++;
8207             if (i >= len) { /* Finished if beyond the end of the array */
8208                 return;
8209             }
8210             current = array[i];
8211             if (current >= end) {   /* Finished if beyond the end of what we
8212                                        are populating */
8213                 if (LIKELY(end < UV_MAX)) {
8214                     return;
8215                 }
8216
8217                 /* We get here when the upper bound is the maximum
8218                  * representable on the machine, and we are looking for just
8219                  * that code point.  Have to special case it */
8220                 i = len;
8221                 goto join_end_of_list;
8222             }
8223         }
8224         assert(current >= start);
8225
8226         /* The current range ends one below the next one, except don't go past
8227          * <end> */
8228         i++;
8229         upper = (i < len && array[i] < end) ? array[i] : end;
8230
8231         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8232          * for each code point in it */
8233         for (; current < upper; current++) {
8234             const STRLEN offset = (STRLEN)(current - start);
8235             swatch[offset >> 3] |= 1 << (offset & 7);
8236         }
8237
8238     join_end_of_list:
8239
8240         /* Quit if at the end of the list */
8241         if (i >= len) {
8242
8243             /* But first, have to deal with the highest possible code point on
8244              * the platform.  The previous code assumes that <end> is one
8245              * beyond where we want to populate, but that is impossible at the
8246              * platform's infinity, so have to handle it specially */
8247             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8248             {
8249                 const STRLEN offset = (STRLEN)(end - start);
8250                 swatch[offset >> 3] |= 1 << (offset & 7);
8251             }
8252             return;
8253         }
8254
8255         /* Advance to the next range, which will be for code points not in the
8256          * inversion list */
8257         current = array[i];
8258     }
8259
8260     return;
8261 }
8262
8263 void
8264 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8265                                          const bool complement_b, SV** output)
8266 {
8267     /* Take the union of two inversion lists and point <output> to it.  *output
8268      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8269      * the reference count to that list will be decremented if not already a
8270      * temporary (mortal); otherwise *output will be made correspondingly
8271      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8272      * second list is returned.  If <complement_b> is TRUE, the union is taken
8273      * of the complement (inversion) of <b> instead of b itself.
8274      *
8275      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8276      * Richard Gillam, published by Addison-Wesley, and explained at some
8277      * length there.  The preface says to incorporate its examples into your
8278      * code at your own risk.
8279      *
8280      * The algorithm is like a merge sort.
8281      *
8282      * XXX A potential performance improvement is to keep track as we go along
8283      * if only one of the inputs contributes to the result, meaning the other
8284      * is a subset of that one.  In that case, we can skip the final copy and
8285      * return the larger of the input lists, but then outside code might need
8286      * to keep track of whether to free the input list or not */
8287
8288     const UV* array_a;    /* a's array */
8289     const UV* array_b;
8290     UV len_a;       /* length of a's array */
8291     UV len_b;
8292
8293     SV* u;                      /* the resulting union */
8294     UV* array_u;
8295     UV len_u;
8296
8297     UV i_a = 0;             /* current index into a's array */
8298     UV i_b = 0;
8299     UV i_u = 0;
8300
8301     /* running count, as explained in the algorithm source book; items are
8302      * stopped accumulating and are output when the count changes to/from 0.
8303      * The count is incremented when we start a range that's in the set, and
8304      * decremented when we start a range that's not in the set.  So its range
8305      * is 0 to 2.  Only when the count is zero is something not in the set.
8306      */
8307     UV count = 0;
8308
8309     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8310     assert(a != b);
8311
8312     /* If either one is empty, the union is the other one */
8313     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8314         bool make_temp = FALSE; /* Should we mortalize the result? */
8315
8316         if (*output == a) {
8317             if (a != NULL) {
8318                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8319                     SvREFCNT_dec_NN(a);
8320                 }
8321             }
8322         }
8323         if (*output != b) {
8324             *output = invlist_clone(b);
8325             if (complement_b) {
8326                 _invlist_invert(*output);
8327             }
8328         } /* else *output already = b; */
8329
8330         if (make_temp) {
8331             sv_2mortal(*output);
8332         }
8333         return;
8334     }
8335     else if ((len_b = _invlist_len(b)) == 0) {
8336         bool make_temp = FALSE;
8337         if (*output == b) {
8338             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8339                 SvREFCNT_dec_NN(b);
8340             }
8341         }
8342
8343         /* The complement of an empty list is a list that has everything in it,
8344          * so the union with <a> includes everything too */
8345         if (complement_b) {
8346             if (a == *output) {
8347                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8348                     SvREFCNT_dec_NN(a);
8349                 }
8350             }
8351             *output = _new_invlist(1);
8352             _append_range_to_invlist(*output, 0, UV_MAX);
8353         }
8354         else if (*output != a) {
8355             *output = invlist_clone(a);
8356         }
8357         /* else *output already = a; */
8358
8359         if (make_temp) {
8360             sv_2mortal(*output);
8361         }
8362         return;
8363     }
8364
8365     /* Here both lists exist and are non-empty */
8366     array_a = invlist_array(a);
8367     array_b = invlist_array(b);
8368
8369     /* If are to take the union of 'a' with the complement of b, set it
8370      * up so are looking at b's complement. */
8371     if (complement_b) {
8372
8373         /* To complement, we invert: if the first element is 0, remove it.  To
8374          * do this, we just pretend the array starts one later */
8375         if (array_b[0] == 0) {
8376             array_b++;
8377             len_b--;
8378         }
8379         else {
8380
8381             /* But if the first element is not zero, we pretend the list starts
8382              * at the 0 that is always stored immediately before the array. */
8383             array_b--;
8384             len_b++;
8385         }
8386     }
8387
8388     /* Size the union for the worst case: that the sets are completely
8389      * disjoint */
8390     u = _new_invlist(len_a + len_b);
8391
8392     /* Will contain U+0000 if either component does */
8393     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8394                                       || (len_b > 0 && array_b[0] == 0));
8395
8396     /* Go through each list item by item, stopping when exhausted one of
8397      * them */
8398     while (i_a < len_a && i_b < len_b) {
8399         UV cp;      /* The element to potentially add to the union's array */
8400         bool cp_in_set;   /* is it in the the input list's set or not */
8401
8402         /* We need to take one or the other of the two inputs for the union.
8403          * Since we are merging two sorted lists, we take the smaller of the
8404          * next items.  In case of a tie, we take the one that is in its set
8405          * first.  If we took one not in the set first, it would decrement the
8406          * count, possibly to 0 which would cause it to be output as ending the
8407          * range, and the next time through we would take the same number, and
8408          * output it again as beginning the next range.  By doing it the
8409          * opposite way, there is no possibility that the count will be
8410          * momentarily decremented to 0, and thus the two adjoining ranges will
8411          * be seamlessly merged.  (In a tie and both are in the set or both not
8412          * in the set, it doesn't matter which we take first.) */
8413         if (array_a[i_a] < array_b[i_b]
8414             || (array_a[i_a] == array_b[i_b]
8415                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8416         {
8417             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8418             cp= array_a[i_a++];
8419         }
8420         else {
8421             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8422             cp = array_b[i_b++];
8423         }
8424
8425         /* Here, have chosen which of the two inputs to look at.  Only output
8426          * if the running count changes to/from 0, which marks the
8427          * beginning/end of a range in that's in the set */
8428         if (cp_in_set) {
8429             if (count == 0) {
8430                 array_u[i_u++] = cp;
8431             }
8432             count++;
8433         }
8434         else {
8435             count--;
8436             if (count == 0) {
8437                 array_u[i_u++] = cp;
8438             }
8439         }
8440     }
8441
8442     /* Here, we are finished going through at least one of the lists, which
8443      * means there is something remaining in at most one.  We check if the list
8444      * that hasn't been exhausted is positioned such that we are in the middle
8445      * of a range in its set or not.  (i_a and i_b point to the element beyond
8446      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8447      * is potentially more to output.
8448      * There are four cases:
8449      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8450      *     in the union is entirely from the non-exhausted set.
8451      *  2) Both were in their sets, count is 2.  Nothing further should
8452      *     be output, as everything that remains will be in the exhausted
8453      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8454      *     that
8455      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8456      *     Nothing further should be output because the union includes
8457      *     everything from the exhausted set.  Not decrementing ensures that.
8458      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8459      *     decrementing to 0 insures that we look at the remainder of the
8460      *     non-exhausted set */
8461     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8462         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8463     {
8464         count--;
8465     }
8466
8467     /* The final length is what we've output so far, plus what else is about to
8468      * be output.  (If 'count' is non-zero, then the input list we exhausted
8469      * has everything remaining up to the machine's limit in its set, and hence
8470      * in the union, so there will be no further output. */
8471     len_u = i_u;
8472     if (count == 0) {
8473         /* At most one of the subexpressions will be non-zero */
8474         len_u += (len_a - i_a) + (len_b - i_b);
8475     }
8476
8477     /* Set result to final length, which can change the pointer to array_u, so
8478      * re-find it */
8479     if (len_u != _invlist_len(u)) {
8480         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8481         invlist_trim(u);
8482         array_u = invlist_array(u);
8483     }
8484
8485     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8486      * the other) ended with everything above it not in its set.  That means
8487      * that the remaining part of the union is precisely the same as the
8488      * non-exhausted list, so can just copy it unchanged.  (If both list were
8489      * exhausted at the same time, then the operations below will be both 0.)
8490      */
8491     if (count == 0) {
8492         IV copy_count; /* At most one will have a non-zero copy count */
8493         if ((copy_count = len_a - i_a) > 0) {
8494             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8495         }
8496         else if ((copy_count = len_b - i_b) > 0) {
8497             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8498         }
8499     }
8500
8501     /*  We may be removing a reference to one of the inputs.  If so, the output
8502      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8503      *  count decremented) */
8504     if (a == *output || b == *output) {
8505         assert(! invlist_is_iterating(*output));
8506         if ((SvTEMP(*output))) {
8507             sv_2mortal(u);
8508         }
8509         else {
8510             SvREFCNT_dec_NN(*output);
8511         }
8512     }
8513
8514     *output = u;
8515
8516     return;
8517 }
8518
8519 void
8520 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8521                                                const bool complement_b, SV** i)
8522 {
8523     /* Take the intersection of two inversion lists and point <i> to it.  *i
8524      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8525      * the reference count to that list will be decremented if not already a
8526      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8527      * The first list, <a>, may be NULL, in which case an empty list is
8528      * returned.  If <complement_b> is TRUE, the result will be the
8529      * intersection of <a> and the complement (or inversion) of <b> instead of
8530      * <b> directly.
8531      *
8532      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8533      * Richard Gillam, published by Addison-Wesley, and explained at some
8534      * length there.  The preface says to incorporate its examples into your
8535      * code at your own risk.  In fact, it had bugs
8536      *
8537      * The algorithm is like a merge sort, and is essentially the same as the
8538      * union above
8539      */
8540
8541     const UV* array_a;          /* a's array */
8542     const UV* array_b;
8543     UV len_a;   /* length of a's array */
8544     UV len_b;
8545
8546     SV* r;                   /* the resulting intersection */
8547     UV* array_r;
8548     UV len_r;
8549
8550     UV i_a = 0;             /* current index into a's array */
8551     UV i_b = 0;
8552     UV i_r = 0;
8553
8554     /* running count, as explained in the algorithm source book; items are
8555      * stopped accumulating and are output when the count changes to/from 2.
8556      * The count is incremented when we start a range that's in the set, and
8557      * decremented when we start a range that's not in the set.  So its range
8558      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8559      */
8560     UV count = 0;
8561
8562     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8563     assert(a != b);
8564
8565     /* Special case if either one is empty */
8566     len_a = (a == NULL) ? 0 : _invlist_len(a);
8567     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8568         bool make_temp = FALSE;
8569
8570         if (len_a != 0 && complement_b) {
8571
8572             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8573              * be empty.  Here, also we are using 'b's complement, which hence
8574              * must be every possible code point.  Thus the intersection is
8575              * simply 'a'. */
8576             if (*i != a) {
8577                 if (*i == b) {
8578                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8579                         SvREFCNT_dec_NN(b);
8580                     }
8581                 }
8582
8583                 *i = invlist_clone(a);
8584             }
8585             /* else *i is already 'a' */
8586
8587             if (make_temp) {
8588                 sv_2mortal(*i);
8589             }
8590             return;
8591         }
8592
8593         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8594          * intersection must be empty */
8595         if (*i == a) {
8596             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8597                 SvREFCNT_dec_NN(a);
8598             }
8599         }
8600         else if (*i == b) {
8601             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8602                 SvREFCNT_dec_NN(b);
8603             }
8604         }
8605         *i = _new_invlist(0);
8606         if (make_temp) {
8607             sv_2mortal(*i);
8608         }
8609
8610         return;
8611     }
8612
8613     /* Here both lists exist and are non-empty */
8614     array_a = invlist_array(a);
8615     array_b = invlist_array(b);
8616
8617     /* If are to take the intersection of 'a' with the complement of b, set it
8618      * up so are looking at b's complement. */
8619     if (complement_b) {
8620
8621         /* To complement, we invert: if the first element is 0, remove it.  To
8622          * do this, we just pretend the array starts one later */
8623         if (array_b[0] == 0) {
8624             array_b++;
8625             len_b--;
8626         }
8627         else {
8628
8629             /* But if the first element is not zero, we pretend the list starts
8630              * at the 0 that is always stored immediately before the array. */
8631             array_b--;
8632             len_b++;
8633         }
8634     }
8635
8636     /* Size the intersection for the worst case: that the intersection ends up
8637      * fragmenting everything to be completely disjoint */
8638     r= _new_invlist(len_a + len_b);
8639
8640     /* Will contain U+0000 iff both components do */
8641     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8642                                      && len_b > 0 && array_b[0] == 0);
8643
8644     /* Go through each list item by item, stopping when exhausted one of
8645      * them */
8646     while (i_a < len_a && i_b < len_b) {
8647         UV cp;      /* The element to potentially add to the intersection's
8648                        array */
8649         bool cp_in_set; /* Is it in the input list's set or not */
8650
8651         /* We need to take one or the other of the two inputs for the
8652          * intersection.  Since we are merging two sorted lists, we take the
8653          * smaller of the next items.  In case of a tie, we take the one that
8654          * is not in its set first (a difference from the union algorithm).  If
8655          * we took one in the set first, it would increment the count, possibly
8656          * to 2 which would cause it to be output as starting a range in the
8657          * intersection, and the next time through we would take that same
8658          * number, and output it again as ending the set.  By doing it the
8659          * opposite of this, there is no possibility that the count will be
8660          * momentarily incremented to 2.  (In a tie and both are in the set or
8661          * both not in the set, it doesn't matter which we take first.) */
8662         if (array_a[i_a] < array_b[i_b]
8663             || (array_a[i_a] == array_b[i_b]
8664                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8665         {
8666             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8667             cp= array_a[i_a++];
8668         }
8669         else {
8670             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8671             cp= array_b[i_b++];
8672         }
8673
8674         /* Here, have chosen which of the two inputs to look at.  Only output
8675          * if the running count changes to/from 2, which marks the
8676          * beginning/end of a range that's in the intersection */
8677         if (cp_in_set) {
8678             count++;
8679             if (count == 2) {
8680                 array_r[i_r++] = cp;
8681             }
8682         }
8683         else {
8684             if (count == 2) {
8685                 array_r[i_r++] = cp;
8686             }
8687             count--;
8688         }
8689     }
8690
8691     /* Here, we are finished going through at least one of the lists, which
8692      * means there is something remaining in at most one.  We check if the list
8693      * that has been exhausted is positioned such that we are in the middle
8694      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8695      * the ones we care about.)  There are four cases:
8696      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8697      *     nothing left in the intersection.
8698      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8699      *     above 2.  What should be output is exactly that which is in the
8700      *     non-exhausted set, as everything it has is also in the intersection
8701      *     set, and everything it doesn't have can't be in the intersection
8702      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8703      *     gets incremented to 2.  Like the previous case, the intersection is
8704      *     everything that remains in the non-exhausted set.
8705      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8706      *     remains 1.  And the intersection has nothing more. */
8707     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8708         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8709     {
8710         count++;
8711     }
8712
8713     /* The final length is what we've output so far plus what else is in the
8714      * intersection.  At most one of the subexpressions below will be non-zero
8715      * */
8716     len_r = i_r;
8717     if (count >= 2) {
8718         len_r += (len_a - i_a) + (len_b - i_b);
8719     }
8720
8721     /* Set result to final length, which can change the pointer to array_r, so
8722      * re-find it */
8723     if (len_r != _invlist_len(r)) {
8724         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8725         invlist_trim(r);
8726         array_r = invlist_array(r);
8727     }
8728
8729     /* Finish outputting any remaining */
8730     if (count >= 2) { /* At most one will have a non-zero copy count */
8731         IV copy_count;
8732         if ((copy_count = len_a - i_a) > 0) {
8733             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8734         }
8735         else if ((copy_count = len_b - i_b) > 0) {
8736             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8737         }
8738     }
8739
8740     /*  We may be removing a reference to one of the inputs.  If so, the output
8741      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8742      *  count decremented) */
8743     if (a == *i || b == *i) {
8744         assert(! invlist_is_iterating(*i));
8745         if (SvTEMP(*i)) {
8746             sv_2mortal(r);
8747         }
8748         else {
8749             SvREFCNT_dec_NN(*i);
8750         }
8751     }
8752
8753     *i = r;
8754
8755     return;
8756 }
8757
8758 SV*
8759 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8760 {
8761     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8762      * set.  A pointer to the inversion list is returned.  This may actually be
8763      * a new list, in which case the passed in one has been destroyed.  The
8764      * passed in inversion list can be NULL, in which case a new one is created
8765      * with just the one range in it */
8766
8767     SV* range_invlist;
8768     UV len;
8769
8770     if (invlist == NULL) {
8771         invlist = _new_invlist(2);
8772         len = 0;
8773     }
8774     else {
8775         len = _invlist_len(invlist);
8776     }
8777
8778     /* If comes after the final entry actually in the list, can just append it
8779      * to the end, */
8780     if (len == 0
8781         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8782             && start >= invlist_array(invlist)[len - 1]))
8783     {
8784         _append_range_to_invlist(invlist, start, end);
8785         return invlist;
8786     }
8787
8788     /* Here, can't just append things, create and return a new inversion list
8789      * which is the union of this range and the existing inversion list */
8790     range_invlist = _new_invlist(2);
8791     _append_range_to_invlist(range_invlist, start, end);
8792
8793     _invlist_union(invlist, range_invlist, &invlist);
8794
8795     /* The temporary can be freed */
8796     SvREFCNT_dec_NN(range_invlist);
8797
8798     return invlist;
8799 }
8800
8801 SV*
8802 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8803                                  UV** other_elements_ptr)
8804 {
8805     /* Create and return an inversion list whose contents are to be populated
8806      * by the caller.  The caller gives the number of elements (in 'size') and
8807      * the very first element ('element0').  This function will set
8808      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8809      * are to be placed.
8810      *
8811      * Obviously there is some trust involved that the caller will properly
8812      * fill in the other elements of the array.
8813      *
8814      * (The first element needs to be passed in, as the underlying code does
8815      * things differently depending on whether it is zero or non-zero) */
8816
8817     SV* invlist = _new_invlist(size);
8818     bool offset;
8819
8820     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8821
8822     _append_range_to_invlist(invlist, element0, element0);
8823     offset = *get_invlist_offset_addr(invlist);
8824
8825     invlist_set_len(invlist, size, offset);
8826     *other_elements_ptr = invlist_array(invlist) + 1;
8827     return invlist;
8828 }
8829
8830 #endif
8831
8832 PERL_STATIC_INLINE SV*
8833 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8834     return _add_range_to_invlist(invlist, cp, cp);
8835 }
8836
8837 #ifndef PERL_IN_XSUB_RE
8838 void
8839 Perl__invlist_invert(pTHX_ SV* const invlist)
8840 {
8841     /* Complement the input inversion list.  This adds a 0 if the list didn't
8842      * have a zero; removes it otherwise.  As described above, the data
8843      * structure is set up so that this is very efficient */
8844
8845     PERL_ARGS_ASSERT__INVLIST_INVERT;
8846
8847     assert(! invlist_is_iterating(invlist));
8848
8849     /* The inverse of matching nothing is matching everything */
8850     if (_invlist_len(invlist) == 0) {
8851         _append_range_to_invlist(invlist, 0, UV_MAX);
8852         return;
8853     }
8854
8855     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8856 }
8857
8858 #endif
8859
8860 PERL_STATIC_INLINE SV*
8861 S_invlist_clone(pTHX_ SV* const invlist)
8862 {
8863
8864     /* Return a new inversion list that is a copy of the input one, which is
8865      * unchanged.  The new list will not be mortal even if the old one was. */
8866
8867     /* Need to allocate extra space to accommodate Perl's addition of a
8868      * trailing NUL to SvPV's, since it thinks they are always strings */
8869     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8870     STRLEN physical_length = SvCUR(invlist);
8871     bool offset = *(get_invlist_offset_addr(invlist));
8872
8873     PERL_ARGS_ASSERT_INVLIST_CLONE;
8874
8875     *(get_invlist_offset_addr(new_invlist)) = offset;
8876     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8877     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8878
8879     return new_invlist;
8880 }
8881
8882 PERL_STATIC_INLINE STRLEN*
8883 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8884 {
8885     /* Return the address of the UV that contains the current iteration
8886      * position */
8887
8888     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8889
8890     assert(SvTYPE(invlist) == SVt_INVLIST);
8891
8892     return &(((XINVLIST*) SvANY(invlist))->iterator);
8893 }
8894
8895 PERL_STATIC_INLINE void
8896 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8897 {
8898     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8899
8900     *get_invlist_iter_addr(invlist) = 0;
8901 }
8902
8903 PERL_STATIC_INLINE void
8904 S_invlist_iterfinish(pTHX_ SV* invlist)
8905 {
8906     /* Terminate iterator for invlist.  This is to catch development errors.
8907      * Any iteration that is interrupted before completed should call this
8908      * function.  Functions that add code points anywhere else but to the end
8909      * of an inversion list assert that they are not in the middle of an
8910      * iteration.  If they were, the addition would make the iteration
8911      * problematical: if the iteration hadn't reached the place where things
8912      * were being added, it would be ok */
8913
8914     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8915
8916     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8917 }
8918
8919 STATIC bool
8920 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8921 {
8922     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8923      * This call sets in <*start> and <*end>, the next range in <invlist>.
8924      * Returns <TRUE> if successful and the next call will return the next
8925      * range; <FALSE> if was already at the end of the list.  If the latter,
8926      * <*start> and <*end> are unchanged, and the next call to this function
8927      * will start over at the beginning of the list */
8928
8929     STRLEN* pos = get_invlist_iter_addr(invlist);
8930     UV len = _invlist_len(invlist);
8931     UV *array;
8932
8933     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8934
8935     if (*pos >= len) {
8936         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8937         return FALSE;
8938     }
8939
8940     array = invlist_array(invlist);
8941
8942     *start = array[(*pos)++];
8943
8944     if (*pos >= len) {
8945         *end = UV_MAX;
8946     }
8947     else {
8948         *end = array[(*pos)++] - 1;
8949     }
8950
8951     return TRUE;
8952 }
8953
8954 PERL_STATIC_INLINE bool
8955 S_invlist_is_iterating(pTHX_ SV* const invlist)
8956 {
8957     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8958
8959     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8960 }
8961
8962 PERL_STATIC_INLINE UV
8963 S_invlist_highest(pTHX_ SV* const invlist)
8964 {
8965     /* Returns the highest code point that matches an inversion list.  This API
8966      * has an ambiguity, as it returns 0 under either the highest is actually
8967      * 0, or if the list is empty.  If this distinction matters to you, check
8968      * for emptiness before calling this function */
8969
8970     UV len = _invlist_len(invlist);
8971     UV *array;
8972
8973     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8974
8975     if (len == 0) {
8976         return 0;
8977     }
8978
8979     array = invlist_array(invlist);
8980
8981     /* The last element in the array in the inversion list always starts a
8982      * range that goes to infinity.  That range may be for code points that are
8983      * matched in the inversion list, or it may be for ones that aren't
8984      * matched.  In the latter case, the highest code point in the set is one
8985      * less than the beginning of this range; otherwise it is the final element
8986      * of this range: infinity */
8987     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8988            ? UV_MAX
8989            : array[len - 1] - 1;
8990 }
8991
8992 #ifndef PERL_IN_XSUB_RE
8993 SV *
8994 Perl__invlist_contents(pTHX_ SV* const invlist)
8995 {
8996     /* Get the contents of an inversion list into a string SV so that they can
8997      * be printed out.  It uses the format traditionally done for debug tracing
8998      */
8999
9000     UV start, end;
9001     SV* output = newSVpvs("\n");
9002
9003     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9004
9005     assert(! invlist_is_iterating(invlist));
9006
9007     invlist_iterinit(invlist);
9008     while (invlist_iternext(invlist, &start, &end)) {
9009         if (end == UV_MAX) {
9010             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9011         }
9012         else if (end != start) {
9013             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9014                     start,       end);
9015         }
9016         else {
9017             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9018         }
9019     }
9020
9021     return output;
9022 }
9023 #endif
9024
9025 #ifndef PERL_IN_XSUB_RE
9026 void
9027 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9028                          const char * const indent, SV* const invlist)
9029 {
9030     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9031      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9032      * the string 'indent'.  The output looks like this:
9033          [0] 0x000A .. 0x000D
9034          [2] 0x0085
9035          [4] 0x2028 .. 0x2029
9036          [6] 0x3104 .. INFINITY
9037      * This means that the first range of code points matched by the list are
9038      * 0xA through 0xD; the second range contains only the single code point
9039      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9040      * are used to define each range (except if the final range extends to
9041      * infinity, only a single element is needed).  The array index of the
9042      * first element for the corresponding range is given in brackets. */
9043
9044     UV start, end;
9045     STRLEN count = 0;
9046
9047     PERL_ARGS_ASSERT__INVLIST_DUMP;
9048
9049     if (invlist_is_iterating(invlist)) {
9050         Perl_dump_indent(aTHX_ level, file,
9051              "%sCan't dump inversion list because is in middle of iterating\n",
9052              indent);
9053         return;
9054     }
9055
9056     invlist_iterinit(invlist);
9057     while (invlist_iternext(invlist, &start, &end)) {
9058         if (end == UV_MAX) {
9059             Perl_dump_indent(aTHX_ level, file,
9060                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9061                                    indent, (UV)count, start);
9062         }
9063         else if (end != start) {
9064             Perl_dump_indent(aTHX_ level, file,
9065                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9066                                 indent, (UV)count, start,         end);
9067         }
9068         else {
9069             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9070                                             indent, (UV)count, start);
9071         }
9072         count += 2;
9073     }
9074 }
9075 #endif
9076
9077 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9078 bool
9079 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9080 {
9081     /* Return a boolean as to if the two passed in inversion lists are
9082      * identical.  The final argument, if TRUE, says to take the complement of
9083      * the second inversion list before doing the comparison */
9084
9085     const UV* array_a = invlist_array(a);
9086     const UV* array_b = invlist_array(b);
9087     UV len_a = _invlist_len(a);
9088     UV len_b = _invlist_len(b);
9089
9090     UV i = 0;               /* current index into the arrays */
9091     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9092
9093     PERL_ARGS_ASSERT__INVLISTEQ;
9094
9095     /* If are to compare 'a' with the complement of b, set it
9096      * up so are looking at b's complement. */
9097     if (complement_b) {
9098
9099         /* The complement of nothing is everything, so <a> would have to have
9100          * just one element, starting at zero (ending at infinity) */
9101         if (len_b == 0) {
9102             return (len_a == 1 && array_a[0] == 0);
9103         }
9104         else if (array_b[0] == 0) {
9105
9106             /* Otherwise, to complement, we invert.  Here, the first element is
9107              * 0, just remove it.  To do this, we just pretend the array starts
9108              * one later */
9109
9110             array_b++;
9111             len_b--;
9112         }
9113         else {
9114
9115             /* But if the first element is not zero, we pretend the list starts
9116              * at the 0 that is always stored immediately before the array. */
9117             array_b--;
9118             len_b++;
9119         }
9120     }
9121
9122     /* Make sure that the lengths are the same, as well as the final element
9123      * before looping through the remainder.  (Thus we test the length, final,
9124      * and first elements right off the bat) */
9125     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9126         retval = FALSE;
9127     }
9128     else for (i = 0; i < len_a - 1; i++) {
9129         if (array_a[i] != array_b[i]) {
9130             retval = FALSE;
9131             break;
9132         }
9133     }
9134
9135     return retval;
9136 }
9137 #endif
9138
9139 #undef HEADER_LENGTH
9140 #undef TO_INTERNAL_SIZE
9141 #undef FROM_INTERNAL_SIZE
9142 #undef INVLIST_VERSION_ID
9143
9144 /* End of inversion list object */
9145
9146 STATIC void
9147 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9148 {
9149     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9150      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9151      * should point to the first flag; it is updated on output to point to the
9152      * final ')' or ':'.  There needs to be at least one flag, or this will
9153      * abort */
9154
9155     /* for (?g), (?gc), and (?o) warnings; warning
9156        about (?c) will warn about (?g) -- japhy    */
9157
9158 #define WASTED_O  0x01
9159 #define WASTED_G  0x02
9160 #define WASTED_C  0x04
9161 #define WASTED_GC (WASTED_G|WASTED_C)
9162     I32 wastedflags = 0x00;
9163     U32 posflags = 0, negflags = 0;
9164     U32 *flagsp = &posflags;
9165     char has_charset_modifier = '\0';
9166     regex_charset cs;
9167     bool has_use_defaults = FALSE;
9168     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9169
9170     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9171
9172     /* '^' as an initial flag sets certain defaults */
9173     if (UCHARAT(RExC_parse) == '^') {
9174         RExC_parse++;
9175         has_use_defaults = TRUE;
9176         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9177         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9178                                         ? REGEX_UNICODE_CHARSET
9179                                         : REGEX_DEPENDS_CHARSET);
9180     }
9181
9182     cs = get_regex_charset(RExC_flags);
9183     if (cs == REGEX_DEPENDS_CHARSET
9184         && (RExC_utf8 || RExC_uni_semantics))
9185     {
9186         cs = REGEX_UNICODE_CHARSET;
9187     }
9188
9189     while (*RExC_parse) {
9190         /* && strchr("iogcmsx", *RExC_parse) */
9191         /* (?g), (?gc) and (?o) are useless here
9192            and must be globally applied -- japhy */
9193         switch (*RExC_parse) {
9194
9195             /* Code for the imsx flags */
9196             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9197
9198             case LOCALE_PAT_MOD:
9199                 if (has_charset_modifier) {
9200                     goto excess_modifier;
9201                 }
9202                 else if (flagsp == &negflags) {
9203                     goto neg_modifier;
9204                 }
9205                 cs = REGEX_LOCALE_CHARSET;
9206                 has_charset_modifier = LOCALE_PAT_MOD;
9207                 break;
9208             case UNICODE_PAT_MOD:
9209                 if (has_charset_modifier) {
9210                     goto excess_modifier;
9211                 }
9212                 else if (flagsp == &negflags) {
9213                     goto neg_modifier;
9214                 }
9215                 cs = REGEX_UNICODE_CHARSET;
9216                 has_charset_modifier = UNICODE_PAT_MOD;
9217                 break;
9218             case ASCII_RESTRICT_PAT_MOD:
9219                 if (flagsp == &negflags) {
9220                     goto neg_modifier;
9221                 }
9222                 if (has_charset_modifier) {
9223                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9224                         goto excess_modifier;
9225                     }
9226                     /* Doubled modifier implies more restricted */
9227                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9228                 }
9229                 else {
9230                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9231                 }
9232                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9233                 break;
9234             case DEPENDS_PAT_MOD:
9235                 if (has_use_defaults) {
9236                     goto fail_modifiers;
9237                 }
9238                 else if (flagsp == &negflags) {
9239                     goto neg_modifier;
9240                 }
9241                 else if (has_charset_modifier) {
9242                     goto excess_modifier;
9243                 }
9244
9245                 /* The dual charset means unicode semantics if the
9246                  * pattern (or target, not known until runtime) are
9247                  * utf8, or something in the pattern indicates unicode
9248                  * semantics */
9249                 cs = (RExC_utf8 || RExC_uni_semantics)
9250                      ? REGEX_UNICODE_CHARSET
9251                      : REGEX_DEPENDS_CHARSET;
9252                 has_charset_modifier = DEPENDS_PAT_MOD;
9253                 break;
9254             excess_modifier:
9255                 RExC_parse++;
9256                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9257                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9258                 }
9259                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9260                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9261                                         *(RExC_parse - 1));
9262                 }
9263                 else {
9264                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9265                 }
9266                 /*NOTREACHED*/
9267             neg_modifier:
9268                 RExC_parse++;
9269                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9270                                     *(RExC_parse - 1));
9271                 /*NOTREACHED*/
9272             case ONCE_PAT_MOD: /* 'o' */
9273             case GLOBAL_PAT_MOD: /* 'g' */
9274                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9275                     const I32 wflagbit = *RExC_parse == 'o'
9276                                          ? WASTED_O
9277                                          : WASTED_G;
9278                     if (! (wastedflags & wflagbit) ) {
9279                         wastedflags |= wflagbit;
9280                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9281                         vWARN5(
9282                             RExC_parse + 1,
9283                             "Useless (%s%c) - %suse /%c modifier",
9284                             flagsp == &negflags ? "?-" : "?",
9285                             *RExC_parse,
9286                             flagsp == &negflags ? "don't " : "",
9287                             *RExC_parse
9288                         );
9289                     }
9290                 }
9291                 break;
9292
9293             case CONTINUE_PAT_MOD: /* 'c' */
9294                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9295                     if (! (wastedflags & WASTED_C) ) {
9296                         wastedflags |= WASTED_GC;
9297                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9298                         vWARN3(
9299                             RExC_parse + 1,
9300                             "Useless (%sc) - %suse /gc modifier",
9301                             flagsp == &negflags ? "?-" : "?",
9302                             flagsp == &negflags ? "don't " : ""
9303                         );
9304                     }
9305                 }
9306                 break;
9307             case KEEPCOPY_PAT_MOD: /* 'p' */
9308                 if (flagsp == &negflags) {
9309                     if (SIZE_ONLY)
9310                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9311                 } else {
9312                     *flagsp |= RXf_PMf_KEEPCOPY;
9313                 }
9314                 break;
9315             case '-':
9316                 /* A flag is a default iff it is following a minus, so
9317                  * if there is a minus, it means will be trying to
9318                  * re-specify a default which is an error */
9319                 if (has_use_defaults || flagsp == &negflags) {
9320                     goto fail_modifiers;
9321                 }
9322                 flagsp = &negflags;
9323                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9324                 break;
9325             case ':':
9326             case ')':
9327                 RExC_flags |= posflags;
9328                 RExC_flags &= ~negflags;
9329                 set_regex_charset(&RExC_flags, cs);
9330                 if (RExC_flags & RXf_PMf_FOLD) {
9331                     RExC_contains_i = 1;
9332                 }
9333                 return;
9334                 /*NOTREACHED*/
9335             default:
9336             fail_modifiers:
9337                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9338                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9339                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9340                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9341                 /*NOTREACHED*/
9342         }
9343
9344         ++RExC_parse;
9345     }
9346 }
9347
9348 /*
9349  - reg - regular expression, i.e. main body or parenthesized thing
9350  *
9351  * Caller must absorb opening parenthesis.
9352  *
9353  * Combining parenthesis handling with the base level of regular expression
9354  * is a trifle forced, but the need to tie the tails of the branches to what
9355  * follows makes it hard to avoid.
9356  */
9357 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9358 #ifdef DEBUGGING
9359 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9360 #else
9361 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9362 #endif
9363
9364 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9365    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9366    needs to be restarted.
9367    Otherwise would only return NULL if regbranch() returns NULL, which
9368    cannot happen.  */
9369 STATIC regnode *
9370 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9371     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9372      * 2 is like 1, but indicates that nextchar() has been called to advance
9373      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9374      * this flag alerts us to the need to check for that */
9375 {
9376     dVAR;
9377     regnode *ret;               /* Will be the head of the group. */
9378     regnode *br;
9379     regnode *lastbr;
9380     regnode *ender = NULL;
9381     I32 parno = 0;
9382     I32 flags;
9383     U32 oregflags = RExC_flags;
9384     bool have_branch = 0;
9385     bool is_open = 0;
9386     I32 freeze_paren = 0;
9387     I32 after_freeze = 0;
9388
9389     char * parse_start = RExC_parse; /* MJD */
9390     char * const oregcomp_parse = RExC_parse;
9391
9392     GET_RE_DEBUG_FLAGS_DECL;
9393
9394     PERL_ARGS_ASSERT_REG;
9395     DEBUG_PARSE("reg ");
9396
9397     *flagp = 0;                         /* Tentatively. */
9398
9399
9400     /* Make an OPEN node, if parenthesized. */
9401     if (paren) {
9402
9403         /* Under /x, space and comments can be gobbled up between the '(' and
9404          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9405          * intervening space, as the sequence is a token, and a token should be
9406          * indivisible */
9407         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9408
9409         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9410             char *start_verb = RExC_parse;
9411             STRLEN verb_len = 0;
9412             char *start_arg = NULL;
9413             unsigned char op = 0;
9414             int argok = 1;
9415             int internal_argval = 0; /* internal_argval is only useful if
9416                                         !argok */
9417
9418             if (has_intervening_patws && SIZE_ONLY) {
9419                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9420             }
9421             while ( *RExC_parse && *RExC_parse != ')' ) {
9422                 if ( *RExC_parse == ':' ) {
9423                     start_arg = RExC_parse + 1;
9424                     break;
9425                 }
9426                 RExC_parse++;
9427             }
9428             ++start_verb;
9429             verb_len = RExC_parse - start_verb;
9430             if ( start_arg ) {
9431                 RExC_parse++;
9432                 while ( *RExC_parse && *RExC_parse != ')' )
9433                     RExC_parse++;
9434                 if ( *RExC_parse != ')' )
9435                     vFAIL("Unterminated verb pattern argument");
9436                 if ( RExC_parse == start_arg )
9437                     start_arg = NULL;
9438             } else {
9439                 if ( *RExC_parse != ')' )
9440                     vFAIL("Unterminated verb pattern");
9441             }
9442
9443             switch ( *start_verb ) {
9444             case 'A':  /* (*ACCEPT) */
9445                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9446                     op = ACCEPT;
9447                     internal_argval = RExC_nestroot;
9448                 }
9449                 break;
9450             case 'C':  /* (*COMMIT) */
9451                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9452                     op = COMMIT;
9453                 break;
9454             case 'F':  /* (*FAIL) */
9455                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9456                     op = OPFAIL;
9457                     argok = 0;
9458                 }
9459                 break;
9460             case ':':  /* (*:NAME) */
9461             case 'M':  /* (*MARK:NAME) */
9462                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9463                     op = MARKPOINT;
9464                     argok = -1;
9465                 }
9466                 break;
9467             case 'P':  /* (*PRUNE) */
9468                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9469                     op = PRUNE;
9470                 break;
9471             case 'S':   /* (*SKIP) */
9472                 if ( memEQs(start_verb,verb_len,"SKIP") )
9473                     op = SKIP;
9474                 break;
9475             case 'T':  /* (*THEN) */
9476                 /* [19:06] <TimToady> :: is then */
9477                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9478                     op = CUTGROUP;
9479                     RExC_seen |= REG_CUTGROUP_SEEN;
9480                 }
9481                 break;
9482             }
9483             if ( ! op ) {
9484                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9485                 vFAIL2utf8f(
9486                     "Unknown verb pattern '%"UTF8f"'",
9487                     UTF8fARG(UTF, verb_len, start_verb));
9488             }
9489             if ( argok ) {
9490                 if ( start_arg && internal_argval ) {
9491                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9492                         verb_len, start_verb);
9493                 } else if ( argok < 0 && !start_arg ) {
9494                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9495                         verb_len, start_verb);
9496                 } else {
9497                     ret = reganode(pRExC_state, op, internal_argval);
9498                     if ( ! internal_argval && ! SIZE_ONLY ) {
9499                         if (start_arg) {
9500                             SV *sv = newSVpvn( start_arg,
9501                                                RExC_parse - start_arg);
9502                             ARG(ret) = add_data( pRExC_state,
9503                                                  STR_WITH_LEN("S"));
9504                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9505                             ret->flags = 0;
9506                         } else {
9507                             ret->flags = 1;
9508                         }
9509                     }
9510                 }
9511                 if (!internal_argval)
9512                     RExC_seen |= REG_VERBARG_SEEN;
9513             } else if ( start_arg ) {
9514                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9515                         verb_len, start_verb);
9516             } else {
9517                 ret = reg_node(pRExC_state, op);
9518             }
9519             nextchar(pRExC_state);
9520             return ret;
9521         }
9522         else if (*RExC_parse == '?') { /* (?...) */
9523             bool is_logical = 0;
9524             const char * const seqstart = RExC_parse;
9525             if (has_intervening_patws && SIZE_ONLY) {
9526                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9527             }
9528
9529             RExC_parse++;
9530             paren = *RExC_parse++;
9531             ret = NULL;                 /* For look-ahead/behind. */
9532             switch (paren) {
9533
9534             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9535                 paren = *RExC_parse++;
9536                 if ( paren == '<')         /* (?P<...>) named capture */
9537                     goto named_capture;
9538                 else if (paren == '>') {   /* (?P>name) named recursion */
9539                     goto named_recursion;
9540                 }
9541                 else if (paren == '=') {   /* (?P=...)  named backref */
9542                     /* this pretty much dupes the code for \k<NAME> in
9543                      * regatom(), if you change this make sure you change that
9544                      * */
9545                     char* name_start = RExC_parse;
9546                     U32 num = 0;
9547                     SV *sv_dat = reg_scan_name(pRExC_state,
9548                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9549                     if (RExC_parse == name_start || *RExC_parse != ')')
9550                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9551                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9552
9553                     if (!SIZE_ONLY) {
9554                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9555                         RExC_rxi->data->data[num]=(void*)sv_dat;
9556                         SvREFCNT_inc_simple_void(sv_dat);
9557                     }
9558                     RExC_sawback = 1;
9559                     ret = reganode(pRExC_state,
9560                                    ((! FOLD)
9561                                      ? NREF
9562                                      : (ASCII_FOLD_RESTRICTED)
9563                                        ? NREFFA
9564                                        : (AT_LEAST_UNI_SEMANTICS)
9565                                          ? NREFFU
9566                                          : (LOC)
9567                                            ? NREFFL
9568                                            : NREFF),
9569                                     num);
9570                     *flagp |= HASWIDTH;
9571
9572                     Set_Node_Offset(ret, parse_start+1);
9573                     Set_Node_Cur_Length(ret, parse_start);
9574
9575                     nextchar(pRExC_state);
9576                     return ret;
9577                 }
9578                 RExC_parse++;
9579                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9580                 vFAIL3("Sequence (%.*s...) not recognized",
9581                                 RExC_parse-seqstart, seqstart);
9582                 /*NOTREACHED*/
9583             case '<':           /* (?<...) */
9584                 if (*RExC_parse == '!')
9585                     paren = ',';
9586                 else if (*RExC_parse != '=')
9587               named_capture:
9588                 {               /* (?<...>) */
9589                     char *name_start;
9590                     SV *svname;
9591                     paren= '>';
9592             case '\'':          /* (?'...') */
9593                     name_start= RExC_parse;
9594                     svname = reg_scan_name(pRExC_state,
9595                         SIZE_ONLY    /* reverse test from the others */
9596                         ? REG_RSN_RETURN_NAME
9597                         : REG_RSN_RETURN_NULL);
9598                     if (RExC_parse == name_start || *RExC_parse != paren)
9599                         vFAIL2("Sequence (?%c... not terminated",
9600                             paren=='>' ? '<' : paren);
9601                     if (SIZE_ONLY) {
9602                         HE *he_str;
9603                         SV *sv_dat = NULL;
9604                         if (!svname) /* shouldn't happen */
9605                             Perl_croak(aTHX_
9606                                 "panic: reg_scan_name returned NULL");
9607                         if (!RExC_paren_names) {
9608                             RExC_paren_names= newHV();
9609                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9610 #ifdef DEBUGGING
9611                             RExC_paren_name_list= newAV();
9612                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9613 #endif
9614                         }
9615                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9616                         if ( he_str )
9617                             sv_dat = HeVAL(he_str);
9618                         if ( ! sv_dat ) {
9619                             /* croak baby croak */
9620                             Perl_croak(aTHX_
9621                                 "panic: paren_name hash element allocation failed");
9622                         } else if ( SvPOK(sv_dat) ) {
9623                             /* (?|...) can mean we have dupes so scan to check
9624                                its already been stored. Maybe a flag indicating
9625                                we are inside such a construct would be useful,
9626                                but the arrays are likely to be quite small, so
9627                                for now we punt -- dmq */
9628                             IV count = SvIV(sv_dat);
9629                             I32 *pv = (I32*)SvPVX(sv_dat);
9630                             IV i;
9631                             for ( i = 0 ; i < count ; i++ ) {
9632                                 if ( pv[i] == RExC_npar ) {
9633                                     count = 0;
9634                                     break;
9635                                 }
9636                             }
9637                             if ( count ) {
9638                                 pv = (I32*)SvGROW(sv_dat,
9639                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9640                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9641                                 pv[count] = RExC_npar;
9642                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9643                             }
9644                         } else {
9645                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9646                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9647                                                                 sizeof(I32));
9648                             SvIOK_on(sv_dat);
9649                             SvIV_set(sv_dat, 1);
9650                         }
9651 #ifdef DEBUGGING
9652                         /* Yes this does cause a memory leak in debugging Perls
9653                          * */
9654                         if (!av_store(RExC_paren_name_list,
9655                                       RExC_npar, SvREFCNT_inc(svname)))
9656                             SvREFCNT_dec_NN(svname);
9657 #endif
9658
9659                         /*sv_dump(sv_dat);*/
9660                     }
9661                     nextchar(pRExC_state);
9662                     paren = 1;
9663                     goto capturing_parens;
9664                 }
9665                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9666                 RExC_in_lookbehind++;
9667                 RExC_parse++;
9668             case '=':           /* (?=...) */
9669                 RExC_seen_zerolen++;
9670                 break;
9671             case '!':           /* (?!...) */
9672                 RExC_seen_zerolen++;
9673                 if (*RExC_parse == ')') {
9674                     ret=reg_node(pRExC_state, OPFAIL);
9675                     nextchar(pRExC_state);
9676                     return ret;
9677                 }
9678                 break;
9679             case '|':           /* (?|...) */
9680                 /* branch reset, behave like a (?:...) except that
9681                    buffers in alternations share the same numbers */
9682                 paren = ':';
9683                 after_freeze = freeze_paren = RExC_npar;
9684                 break;
9685             case ':':           /* (?:...) */
9686             case '>':           /* (?>...) */
9687                 break;
9688             case '$':           /* (?$...) */
9689             case '@':           /* (?@...) */
9690                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9691                 break;
9692             case '#':           /* (?#...) */
9693                 /* XXX As soon as we disallow separating the '?' and '*' (by
9694                  * spaces or (?#...) comment), it is believed that this case
9695                  * will be unreachable and can be removed.  See
9696                  * [perl #117327] */
9697                 while (*RExC_parse && *RExC_parse != ')')
9698                     RExC_parse++;
9699                 if (*RExC_parse != ')')
9700                     FAIL("Sequence (?#... not terminated");
9701                 nextchar(pRExC_state);
9702                 *flagp = TRYAGAIN;
9703                 return NULL;
9704             case '0' :           /* (?0) */
9705             case 'R' :           /* (?R) */
9706                 if (*RExC_parse != ')')
9707                     FAIL("Sequence (?R) not terminated");
9708                 ret = reg_node(pRExC_state, GOSTART);
9709                     RExC_seen |= REG_GOSTART_SEEN;
9710                 *flagp |= POSTPONED;
9711                 nextchar(pRExC_state);
9712                 return ret;
9713                 /*notreached*/
9714             { /* named and numeric backreferences */
9715                 I32 num;
9716             case '&':            /* (?&NAME) */
9717                 parse_start = RExC_parse - 1;
9718               named_recursion:
9719                 {
9720                     SV *sv_dat = reg_scan_name(pRExC_state,
9721                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9722                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9723                 }
9724                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9725                     vFAIL("Sequence (?&... not terminated");
9726                 goto gen_recurse_regop;
9727                 assert(0); /* NOT REACHED */
9728             case '+':
9729                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9730                     RExC_parse++;
9731                     vFAIL("Illegal pattern");
9732                 }
9733                 goto parse_recursion;
9734                 /* NOT REACHED*/
9735             case '-': /* (?-1) */
9736                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9737                     RExC_parse--; /* rewind to let it be handled later */
9738                     goto parse_flags;
9739                 }
9740                 /*FALLTHROUGH */
9741             case '1': case '2': case '3': case '4': /* (?1) */
9742             case '5': case '6': case '7': case '8': case '9':
9743                 RExC_parse--;
9744               parse_recursion:
9745                 num = atoi(RExC_parse);
9746                 parse_start = RExC_parse - 1; /* MJD */
9747                 if (*RExC_parse == '-')
9748                     RExC_parse++;
9749                 while (isDIGIT(*RExC_parse))
9750                         RExC_parse++;
9751                 if (*RExC_parse!=')')
9752                     vFAIL("Expecting close bracket");
9753
9754               gen_recurse_regop:
9755                 if ( paren == '-' ) {
9756                     /*
9757                     Diagram of capture buffer numbering.
9758                     Top line is the normal capture buffer numbers
9759                     Bottom line is the negative indexing as from
9760                     the X (the (?-2))
9761
9762                     +   1 2    3 4 5 X          6 7
9763                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9764                     -   5 4    3 2 1 X          x x
9765
9766                     */
9767                     num = RExC_npar + num;
9768                     if (num < 1)  {
9769                         RExC_parse++;
9770                         vFAIL("Reference to nonexistent group");
9771                     }
9772                 } else if ( paren == '+' ) {
9773                     num = RExC_npar + num - 1;
9774                 }
9775
9776                 ret = reganode(pRExC_state, GOSUB, num);
9777                 if (!SIZE_ONLY) {
9778                     if (num > (I32)RExC_rx->nparens) {
9779                         RExC_parse++;
9780                         vFAIL("Reference to nonexistent group");
9781                     }
9782                     ARG2L_SET( ret, RExC_recurse_count++);
9783                     RExC_emit++;
9784                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9785                         "Recurse #%"UVuf" to %"IVdf"\n",
9786                               (UV)ARG(ret), (IV)ARG2L(ret)));
9787                 } else {
9788                     RExC_size++;
9789                 }
9790                     RExC_seen |= REG_RECURSE_SEEN;
9791                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9792                 Set_Node_Offset(ret, parse_start); /* MJD */
9793
9794                 *flagp |= POSTPONED;
9795                 nextchar(pRExC_state);
9796                 return ret;
9797             } /* named and numeric backreferences */
9798             assert(0); /* NOT REACHED */
9799
9800             case '?':           /* (??...) */
9801                 is_logical = 1;
9802                 if (*RExC_parse != '{') {
9803                     RExC_parse++;
9804                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9805                     vFAIL2utf8f(
9806                         "Sequence (%"UTF8f"...) not recognized",
9807                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9808                     /*NOTREACHED*/
9809                 }
9810                 *flagp |= POSTPONED;
9811                 paren = *RExC_parse++;
9812                 /* FALL THROUGH */
9813             case '{':           /* (?{...}) */
9814             {
9815                 U32 n = 0;
9816                 struct reg_code_block *cb;
9817
9818                 RExC_seen_zerolen++;
9819
9820                 if (   !pRExC_state->num_code_blocks
9821                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9822                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9823                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9824                             - RExC_start)
9825                 ) {
9826                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9827                         FAIL("panic: Sequence (?{...}): no code block found\n");
9828                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9829                 }
9830                 /* this is a pre-compiled code block (?{...}) */
9831                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9832                 RExC_parse = RExC_start + cb->end;
9833                 if (!SIZE_ONLY) {
9834                     OP *o = cb->block;
9835                     if (cb->src_regex) {
9836                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9837                         RExC_rxi->data->data[n] =
9838                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9839                         RExC_rxi->data->data[n+1] = (void*)o;
9840                     }
9841                     else {
9842                         n = add_data(pRExC_state,
9843                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9844                         RExC_rxi->data->data[n] = (void*)o;
9845                     }
9846                 }
9847                 pRExC_state->code_index++;
9848                 nextchar(pRExC_state);
9849
9850                 if (is_logical) {
9851                     regnode *eval;
9852                     ret = reg_node(pRExC_state, LOGICAL);
9853                     eval = reganode(pRExC_state, EVAL, n);
9854                     if (!SIZE_ONLY) {
9855                         ret->flags = 2;
9856                         /* for later propagation into (??{}) return value */
9857                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9858                     }
9859                     REGTAIL(pRExC_state, ret, eval);
9860                     /* deal with the length of this later - MJD */
9861                     return ret;
9862                 }
9863                 ret = reganode(pRExC_state, EVAL, n);
9864                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9865                 Set_Node_Offset(ret, parse_start);
9866                 return ret;
9867             }
9868             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9869             {
9870                 int is_define= 0;
9871                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9872                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9873                         || RExC_parse[1] == '<'
9874                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9875                         I32 flag;
9876                         regnode *tail;
9877
9878                         ret = reg_node(pRExC_state, LOGICAL);
9879                         if (!SIZE_ONLY)
9880                             ret->flags = 1;
9881
9882                         tail = reg(pRExC_state, 1, &flag, depth+1);
9883                         if (flag & RESTART_UTF8) {
9884                             *flagp = RESTART_UTF8;
9885                             return NULL;
9886                         }
9887                         REGTAIL(pRExC_state, ret, tail);
9888                         goto insert_if;
9889                     }
9890                 }
9891                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9892                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9893                 {
9894                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9895                     char *name_start= RExC_parse++;
9896                     U32 num = 0;
9897                     SV *sv_dat=reg_scan_name(pRExC_state,
9898                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9899                     if (RExC_parse == name_start || *RExC_parse != ch)
9900                         vFAIL2("Sequence (?(%c... not terminated",
9901                             (ch == '>' ? '<' : ch));
9902                     RExC_parse++;
9903                     if (!SIZE_ONLY) {
9904                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9905                         RExC_rxi->data->data[num]=(void*)sv_dat;
9906                         SvREFCNT_inc_simple_void(sv_dat);
9907                     }
9908                     ret = reganode(pRExC_state,NGROUPP,num);
9909                     goto insert_if_check_paren;
9910                 }
9911                 else if (RExC_parse[0] == 'D' &&
9912                          RExC_parse[1] == 'E' &&
9913                          RExC_parse[2] == 'F' &&
9914                          RExC_parse[3] == 'I' &&
9915                          RExC_parse[4] == 'N' &&
9916                          RExC_parse[5] == 'E')
9917                 {
9918                     ret = reganode(pRExC_state,DEFINEP,0);
9919                     RExC_parse +=6 ;
9920                     is_define = 1;
9921                     goto insert_if_check_paren;
9922                 }
9923                 else if (RExC_parse[0] == 'R') {
9924                     RExC_parse++;
9925                     parno = 0;
9926                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9927                         parno = atoi(RExC_parse++);
9928                         while (isDIGIT(*RExC_parse))
9929                             RExC_parse++;
9930                     } else if (RExC_parse[0] == '&') {
9931                         SV *sv_dat;
9932                         RExC_parse++;
9933                         sv_dat = reg_scan_name(pRExC_state,
9934                             SIZE_ONLY
9935                             ? REG_RSN_RETURN_NULL
9936                             : REG_RSN_RETURN_DATA);
9937                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9938                     }
9939                     ret = reganode(pRExC_state,INSUBP,parno);
9940                     goto insert_if_check_paren;
9941                 }
9942                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9943                     /* (?(1)...) */
9944                     char c;
9945                     char *tmp;
9946                     parno = atoi(RExC_parse++);
9947
9948                     while (isDIGIT(*RExC_parse))
9949                         RExC_parse++;
9950                     ret = reganode(pRExC_state, GROUPP, parno);
9951
9952                  insert_if_check_paren:
9953                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9954                         /* nextchar also skips comments, so undo its work
9955                          * and skip over the the next character.
9956                          */
9957                         RExC_parse = tmp;
9958                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9959                         vFAIL("Switch condition not recognized");
9960                     }
9961                   insert_if:
9962                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9963                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9964                     if (br == NULL) {
9965                         if (flags & RESTART_UTF8) {
9966                             *flagp = RESTART_UTF8;
9967                             return NULL;
9968                         }
9969                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9970                               (UV) flags);
9971                     } else
9972                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9973                                                           LONGJMP, 0));
9974                     c = *nextchar(pRExC_state);
9975                     if (flags&HASWIDTH)
9976                         *flagp |= HASWIDTH;
9977                     if (c == '|') {
9978                         if (is_define)
9979                             vFAIL("(?(DEFINE)....) does not allow branches");
9980
9981                         /* Fake one for optimizer.  */
9982                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9983
9984                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9985                             if (flags & RESTART_UTF8) {
9986                                 *flagp = RESTART_UTF8;
9987                                 return NULL;
9988                             }
9989                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9990                                   (UV) flags);
9991                         }
9992                         REGTAIL(pRExC_state, ret, lastbr);
9993                         if (flags&HASWIDTH)
9994                             *flagp |= HASWIDTH;
9995                         c = *nextchar(pRExC_state);
9996                     }
9997                     else
9998                         lastbr = NULL;
9999                     if (c != ')')
10000                         vFAIL("Switch (?(condition)... contains too many branches");
10001                     ender = reg_node(pRExC_state, TAIL);
10002                     REGTAIL(pRExC_state, br, ender);
10003                     if (lastbr) {
10004                         REGTAIL(pRExC_state, lastbr, ender);
10005                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10006                     }
10007                     else
10008                         REGTAIL(pRExC_state, ret, ender);
10009                     RExC_size++; /* XXX WHY do we need this?!!
10010                                     For large programs it seems to be required
10011                                     but I can't figure out why. -- dmq*/
10012                     return ret;
10013                 }
10014                 else {
10015                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10016                     vFAIL("Unknown switch condition (?(...))");
10017                 }
10018             }
10019             case '[':           /* (?[ ... ]) */
10020                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10021                                          oregcomp_parse);
10022             case 0:
10023                 RExC_parse--; /* for vFAIL to print correctly */
10024                 vFAIL("Sequence (? incomplete");
10025                 break;
10026             default: /* e.g., (?i) */
10027                 --RExC_parse;
10028               parse_flags:
10029                 parse_lparen_question_flags(pRExC_state);
10030                 if (UCHARAT(RExC_parse) != ':') {
10031                     nextchar(pRExC_state);
10032                     *flagp = TRYAGAIN;
10033                     return NULL;
10034                 }
10035                 paren = ':';
10036                 nextchar(pRExC_state);
10037                 ret = NULL;
10038                 goto parse_rest;
10039             } /* end switch */
10040         }
10041         else {                  /* (...) */
10042           capturing_parens:
10043             parno = RExC_npar;
10044             RExC_npar++;
10045
10046             ret = reganode(pRExC_state, OPEN, parno);
10047             if (!SIZE_ONLY ){
10048                 if (!RExC_nestroot)
10049                     RExC_nestroot = parno;
10050                 if (RExC_seen & REG_RECURSE_SEEN
10051                     && !RExC_open_parens[parno-1])
10052                 {
10053                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10054                         "Setting open paren #%"IVdf" to %d\n",
10055                         (IV)parno, REG_NODE_NUM(ret)));
10056                     RExC_open_parens[parno-1]= ret;
10057                 }
10058             }
10059             Set_Node_Length(ret, 1); /* MJD */
10060             Set_Node_Offset(ret, RExC_parse); /* MJD */
10061             is_open = 1;
10062         }
10063     }
10064     else                        /* ! paren */
10065         ret = NULL;
10066
10067    parse_rest:
10068     /* Pick up the branches, linking them together. */
10069     parse_start = RExC_parse;   /* MJD */
10070     br = regbranch(pRExC_state, &flags, 1,depth+1);
10071
10072     /*     branch_len = (paren != 0); */
10073
10074     if (br == NULL) {
10075         if (flags & RESTART_UTF8) {
10076             *flagp = RESTART_UTF8;
10077             return NULL;
10078         }
10079         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10080     }
10081     if (*RExC_parse == '|') {
10082         if (!SIZE_ONLY && RExC_extralen) {
10083             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10084         }
10085         else {                  /* MJD */
10086             reginsert(pRExC_state, BRANCH, br, depth+1);
10087             Set_Node_Length(br, paren != 0);
10088             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10089         }
10090         have_branch = 1;
10091         if (SIZE_ONLY)
10092             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10093     }
10094     else if (paren == ':') {
10095         *flagp |= flags&SIMPLE;
10096     }
10097     if (is_open) {                              /* Starts with OPEN. */
10098         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10099     }
10100     else if (paren != '?')              /* Not Conditional */
10101         ret = br;
10102     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10103     lastbr = br;
10104     while (*RExC_parse == '|') {
10105         if (!SIZE_ONLY && RExC_extralen) {
10106             ender = reganode(pRExC_state, LONGJMP,0);
10107
10108             /* Append to the previous. */
10109             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10110         }
10111         if (SIZE_ONLY)
10112             RExC_extralen += 2;         /* Account for LONGJMP. */
10113         nextchar(pRExC_state);
10114         if (freeze_paren) {
10115             if (RExC_npar > after_freeze)
10116                 after_freeze = RExC_npar;
10117             RExC_npar = freeze_paren;
10118         }
10119         br = regbranch(pRExC_state, &flags, 0, depth+1);
10120
10121         if (br == NULL) {
10122             if (flags & RESTART_UTF8) {
10123                 *flagp = RESTART_UTF8;
10124                 return NULL;
10125             }
10126             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10127         }
10128         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10129         lastbr = br;
10130         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10131     }
10132
10133     if (have_branch || paren != ':') {
10134         /* Make a closing node, and hook it on the end. */
10135         switch (paren) {
10136         case ':':
10137             ender = reg_node(pRExC_state, TAIL);
10138             break;
10139         case 1: case 2:
10140             ender = reganode(pRExC_state, CLOSE, parno);
10141             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10142                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10143                         "Setting close paren #%"IVdf" to %d\n",
10144                         (IV)parno, REG_NODE_NUM(ender)));
10145                 RExC_close_parens[parno-1]= ender;
10146                 if (RExC_nestroot == parno)
10147                     RExC_nestroot = 0;
10148             }
10149             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10150             Set_Node_Length(ender,1); /* MJD */
10151             break;
10152         case '<':
10153         case ',':
10154         case '=':
10155         case '!':
10156             *flagp &= ~HASWIDTH;
10157             /* FALL THROUGH */
10158         case '>':
10159             ender = reg_node(pRExC_state, SUCCEED);
10160             break;
10161         case 0:
10162             ender = reg_node(pRExC_state, END);
10163             if (!SIZE_ONLY) {
10164                 assert(!RExC_opend); /* there can only be one! */
10165                 RExC_opend = ender;
10166             }
10167             break;
10168         }
10169         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10170             SV * const mysv_val1=sv_newmortal();
10171             SV * const mysv_val2=sv_newmortal();
10172             DEBUG_PARSE_MSG("lsbr");
10173             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10174             regprop(RExC_rx, mysv_val2, ender, NULL);
10175             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10176                           SvPV_nolen_const(mysv_val1),
10177                           (IV)REG_NODE_NUM(lastbr),
10178                           SvPV_nolen_const(mysv_val2),
10179                           (IV)REG_NODE_NUM(ender),
10180                           (IV)(ender - lastbr)
10181             );
10182         });
10183         REGTAIL(pRExC_state, lastbr, ender);
10184
10185         if (have_branch && !SIZE_ONLY) {
10186             char is_nothing= 1;
10187             if (depth==1)
10188                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10189
10190             /* Hook the tails of the branches to the closing node. */
10191             for (br = ret; br; br = regnext(br)) {
10192                 const U8 op = PL_regkind[OP(br)];
10193                 if (op == BRANCH) {
10194                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10195                     if ( OP(NEXTOPER(br)) != NOTHING
10196                          || regnext(NEXTOPER(br)) != ender)
10197                         is_nothing= 0;
10198                 }
10199                 else if (op == BRANCHJ) {
10200                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10201                     /* for now we always disable this optimisation * /
10202                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10203                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10204                     */
10205                         is_nothing= 0;
10206                 }
10207             }
10208             if (is_nothing) {
10209                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10210                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10211                     SV * const mysv_val1=sv_newmortal();
10212                     SV * const mysv_val2=sv_newmortal();
10213                     DEBUG_PARSE_MSG("NADA");
10214                     regprop(RExC_rx, mysv_val1, ret, NULL);
10215                     regprop(RExC_rx, mysv_val2, ender, NULL);
10216                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10217                                   SvPV_nolen_const(mysv_val1),
10218                                   (IV)REG_NODE_NUM(ret),
10219                                   SvPV_nolen_const(mysv_val2),
10220                                   (IV)REG_NODE_NUM(ender),
10221                                   (IV)(ender - ret)
10222                     );
10223                 });
10224                 OP(br)= NOTHING;
10225                 if (OP(ender) == TAIL) {
10226                     NEXT_OFF(br)= 0;
10227                     RExC_emit= br + 1;
10228                 } else {
10229                     regnode *opt;
10230                     for ( opt= br + 1; opt < ender ; opt++ )
10231                         OP(opt)= OPTIMIZED;
10232                     NEXT_OFF(br)= ender - br;
10233                 }
10234             }
10235         }
10236     }
10237
10238     {
10239         const char *p;
10240         static const char parens[] = "=!<,>";
10241
10242         if (paren && (p = strchr(parens, paren))) {
10243             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10244             int flag = (p - parens) > 1;
10245
10246             if (paren == '>')
10247                 node = SUSPEND, flag = 0;
10248             reginsert(pRExC_state, node,ret, depth+1);
10249             Set_Node_Cur_Length(ret, parse_start);
10250             Set_Node_Offset(ret, parse_start + 1);
10251             ret->flags = flag;
10252             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10253         }
10254     }
10255
10256     /* Check for proper termination. */
10257     if (paren) {
10258         /* restore original flags, but keep (?p) */
10259         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10260         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10261             RExC_parse = oregcomp_parse;
10262             vFAIL("Unmatched (");
10263         }
10264     }
10265     else if (!paren && RExC_parse < RExC_end) {
10266         if (*RExC_parse == ')') {
10267             RExC_parse++;
10268             vFAIL("Unmatched )");
10269         }
10270         else
10271             FAIL("Junk on end of regexp");      /* "Can't happen". */
10272         assert(0); /* NOTREACHED */
10273     }
10274
10275     if (RExC_in_lookbehind) {
10276         RExC_in_lookbehind--;
10277     }
10278     if (after_freeze > RExC_npar)
10279         RExC_npar = after_freeze;
10280     return(ret);
10281 }
10282
10283 /*
10284  - regbranch - one alternative of an | operator
10285  *
10286  * Implements the concatenation operator.
10287  *
10288  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10289  * restarted.
10290  */
10291 STATIC regnode *
10292 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10293 {
10294     dVAR;
10295     regnode *ret;
10296     regnode *chain = NULL;
10297     regnode *latest;
10298     I32 flags = 0, c = 0;
10299     GET_RE_DEBUG_FLAGS_DECL;
10300
10301     PERL_ARGS_ASSERT_REGBRANCH;
10302
10303     DEBUG_PARSE("brnc");
10304
10305     if (first)
10306         ret = NULL;
10307     else {
10308         if (!SIZE_ONLY && RExC_extralen)
10309             ret = reganode(pRExC_state, BRANCHJ,0);
10310         else {
10311             ret = reg_node(pRExC_state, BRANCH);
10312             Set_Node_Length(ret, 1);
10313         }
10314     }
10315
10316     if (!first && SIZE_ONLY)
10317         RExC_extralen += 1;                     /* BRANCHJ */
10318
10319     *flagp = WORST;                     /* Tentatively. */
10320
10321     RExC_parse--;
10322     nextchar(pRExC_state);
10323     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10324         flags &= ~TRYAGAIN;
10325         latest = regpiece(pRExC_state, &flags,depth+1);
10326         if (latest == NULL) {
10327             if (flags & TRYAGAIN)
10328                 continue;
10329             if (flags & RESTART_UTF8) {
10330                 *flagp = RESTART_UTF8;
10331                 return NULL;
10332             }
10333             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10334         }
10335         else if (ret == NULL)
10336             ret = latest;
10337         *flagp |= flags&(HASWIDTH|POSTPONED);
10338         if (chain == NULL)      /* First piece. */
10339             *flagp |= flags&SPSTART;
10340         else {
10341             RExC_naughty++;
10342             REGTAIL(pRExC_state, chain, latest);
10343         }
10344         chain = latest;
10345         c++;
10346     }
10347     if (chain == NULL) {        /* Loop ran zero times. */
10348         chain = reg_node(pRExC_state, NOTHING);
10349         if (ret == NULL)
10350             ret = chain;
10351     }
10352     if (c == 1) {
10353         *flagp |= flags&SIMPLE;
10354     }
10355
10356     return ret;
10357 }
10358
10359 /*
10360  - regpiece - something followed by possible [*+?]
10361  *
10362  * Note that the branching code sequences used for ? and the general cases
10363  * of * and + are somewhat optimized:  they use the same NOTHING node as
10364  * both the endmarker for their branch list and the body of the last branch.
10365  * It might seem that this node could be dispensed with entirely, but the
10366  * endmarker role is not redundant.
10367  *
10368  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10369  * TRYAGAIN.
10370  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10371  * restarted.
10372  */
10373 STATIC regnode *
10374 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10375 {
10376     dVAR;
10377     regnode *ret;
10378     char op;
10379     char *next;
10380     I32 flags;
10381     const char * const origparse = RExC_parse;
10382     I32 min;
10383     I32 max = REG_INFTY;
10384 #ifdef RE_TRACK_PATTERN_OFFSETS
10385     char *parse_start;
10386 #endif
10387     const char *maxpos = NULL;
10388
10389     /* Save the original in case we change the emitted regop to a FAIL. */
10390     regnode * const orig_emit = RExC_emit;
10391
10392     GET_RE_DEBUG_FLAGS_DECL;
10393
10394     PERL_ARGS_ASSERT_REGPIECE;
10395
10396     DEBUG_PARSE("piec");
10397
10398     ret = regatom(pRExC_state, &flags,depth+1);
10399     if (ret == NULL) {
10400         if (flags & (TRYAGAIN|RESTART_UTF8))
10401             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10402         else
10403             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10404         return(NULL);
10405     }
10406
10407     op = *RExC_parse;
10408
10409     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10410         maxpos = NULL;
10411 #ifdef RE_TRACK_PATTERN_OFFSETS
10412         parse_start = RExC_parse; /* MJD */
10413 #endif
10414         next = RExC_parse + 1;
10415         while (isDIGIT(*next) || *next == ',') {
10416             if (*next == ',') {
10417                 if (maxpos)
10418                     break;
10419                 else
10420                     maxpos = next;
10421             }
10422             next++;
10423         }
10424         if (*next == '}') {             /* got one */
10425             if (!maxpos)
10426                 maxpos = next;
10427             RExC_parse++;
10428             min = atoi(RExC_parse);
10429             if (*maxpos == ',')
10430                 maxpos++;
10431             else
10432                 maxpos = RExC_parse;
10433             max = atoi(maxpos);
10434             if (!max && *maxpos != '0')
10435                 max = REG_INFTY;                /* meaning "infinity" */
10436             else if (max >= REG_INFTY)
10437                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10438             RExC_parse = next;
10439             nextchar(pRExC_state);
10440             if (max < min) {    /* If can't match, warn and optimize to fail
10441                                    unconditionally */
10442                 if (SIZE_ONLY) {
10443                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10444
10445                     /* We can't back off the size because we have to reserve
10446                      * enough space for all the things we are about to throw
10447                      * away, but we can shrink it by the ammount we are about
10448                      * to re-use here */
10449                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10450                 }
10451                 else {
10452                     RExC_emit = orig_emit;
10453                 }
10454                 ret = reg_node(pRExC_state, OPFAIL);
10455                 return ret;
10456             }
10457             else if (min == max
10458                      && RExC_parse < RExC_end
10459                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10460             {
10461                 if (SIZE_ONLY) {
10462                     ckWARN2reg(RExC_parse + 1,
10463                                "Useless use of greediness modifier '%c'",
10464                                *RExC_parse);
10465                 }
10466                 /* Absorb the modifier, so later code doesn't see nor use
10467                     * it */
10468                 nextchar(pRExC_state);
10469             }
10470
10471         do_curly:
10472             if ((flags&SIMPLE)) {
10473                 RExC_naughty += 2 + RExC_naughty / 2;
10474                 reginsert(pRExC_state, CURLY, ret, depth+1);
10475                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10476                 Set_Node_Cur_Length(ret, parse_start);
10477             }
10478             else {
10479                 regnode * const w = reg_node(pRExC_state, WHILEM);
10480
10481                 w->flags = 0;
10482                 REGTAIL(pRExC_state, ret, w);
10483                 if (!SIZE_ONLY && RExC_extralen) {
10484                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10485                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10486                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10487                 }
10488                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10489                                 /* MJD hk */
10490                 Set_Node_Offset(ret, parse_start+1);
10491                 Set_Node_Length(ret,
10492                                 op == '{' ? (RExC_parse - parse_start) : 1);
10493
10494                 if (!SIZE_ONLY && RExC_extralen)
10495                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10496                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10497                 if (SIZE_ONLY)
10498                     RExC_whilem_seen++, RExC_extralen += 3;
10499                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10500             }
10501             ret->flags = 0;
10502
10503             if (min > 0)
10504                 *flagp = WORST;
10505             if (max > 0)
10506                 *flagp |= HASWIDTH;
10507             if (!SIZE_ONLY) {
10508                 ARG1_SET(ret, (U16)min);
10509                 ARG2_SET(ret, (U16)max);
10510             }
10511             if (max == REG_INFTY)
10512                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10513
10514             goto nest_check;
10515         }
10516     }
10517
10518     if (!ISMULT1(op)) {
10519         *flagp = flags;
10520         return(ret);
10521     }
10522
10523 #if 0                           /* Now runtime fix should be reliable. */
10524
10525     /* if this is reinstated, don't forget to put this back into perldiag:
10526
10527             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10528
10529            (F) The part of the regexp subject to either the * or + quantifier
10530            could match an empty string. The {#} shows in the regular
10531            expression about where the problem was discovered.
10532
10533     */
10534
10535     if (!(flags&HASWIDTH) && op != '?')
10536       vFAIL("Regexp *+ operand could be empty");
10537 #endif
10538
10539 #ifdef RE_TRACK_PATTERN_OFFSETS
10540     parse_start = RExC_parse;
10541 #endif
10542     nextchar(pRExC_state);
10543
10544     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10545
10546     if (op == '*' && (flags&SIMPLE)) {
10547         reginsert(pRExC_state, STAR, ret, depth+1);
10548         ret->flags = 0;
10549         RExC_naughty += 4;
10550         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10551     }
10552     else if (op == '*') {
10553         min = 0;
10554         goto do_curly;
10555     }
10556     else if (op == '+' && (flags&SIMPLE)) {
10557         reginsert(pRExC_state, PLUS, ret, depth+1);
10558         ret->flags = 0;
10559         RExC_naughty += 3;
10560         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10561     }
10562     else if (op == '+') {
10563         min = 1;
10564         goto do_curly;
10565     }
10566     else if (op == '?') {
10567         min = 0; max = 1;
10568         goto do_curly;
10569     }
10570   nest_check:
10571     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10572         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10573         ckWARN2reg(RExC_parse,
10574                    "%"UTF8f" matches null string many times",
10575                    UTF8fARG(UTF, (RExC_parse >= origparse
10576                                  ? RExC_parse - origparse
10577                                  : 0),
10578                    origparse));
10579         (void)ReREFCNT_inc(RExC_rx_sv);
10580     }
10581
10582     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10583         nextchar(pRExC_state);
10584         reginsert(pRExC_state, MINMOD, ret, depth+1);
10585         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10586     }
10587     else
10588     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10589         regnode *ender;
10590         nextchar(pRExC_state);
10591         ender = reg_node(pRExC_state, SUCCEED);
10592         REGTAIL(pRExC_state, ret, ender);
10593         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10594         ret->flags = 0;
10595         ender = reg_node(pRExC_state, TAIL);
10596         REGTAIL(pRExC_state, ret, ender);
10597     }
10598
10599     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10600         RExC_parse++;
10601         vFAIL("Nested quantifiers");
10602     }
10603
10604     return(ret);
10605 }
10606
10607 STATIC bool
10608 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10609                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10610                       const bool strict   /* Apply stricter parsing rules? */
10611     )
10612 {
10613
10614  /* This is expected to be called by a parser routine that has recognized '\N'
10615    and needs to handle the rest. RExC_parse is expected to point at the first
10616    char following the N at the time of the call.  On successful return,
10617    RExC_parse has been updated to point to just after the sequence identified
10618    by this routine, and <*flagp> has been updated.
10619
10620    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10621    character class.
10622
10623    \N may begin either a named sequence, or if outside a character class, mean
10624    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10625    attempted to decide which, and in the case of a named sequence, converted it
10626    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10627    where c1... are the characters in the sequence.  For single-quoted regexes,
10628    the tokenizer passes the \N sequence through unchanged; this code will not
10629    attempt to determine this nor expand those, instead raising a syntax error.
10630    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10631    or there is no '}', it signals that this \N occurrence means to match a
10632    non-newline.
10633
10634    Only the \N{U+...} form should occur in a character class, for the same
10635    reason that '.' inside a character class means to just match a period: it
10636    just doesn't make sense.
10637
10638    The function raises an error (via vFAIL), and doesn't return for various
10639    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10640    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10641    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10642    only possible if node_p is non-NULL.
10643
10644
10645    If <valuep> is non-null, it means the caller can accept an input sequence
10646    consisting of a just a single code point; <*valuep> is set to that value
10647    if the input is such.
10648
10649    If <node_p> is non-null it signifies that the caller can accept any other
10650    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10651    is set as follows:
10652     1) \N means not-a-NL: points to a newly created REG_ANY node;
10653     2) \N{}:              points to a new NOTHING node;
10654     3) otherwise:         points to a new EXACT node containing the resolved
10655                           string.
10656    Note that FALSE is returned for single code point sequences if <valuep> is
10657    null.
10658  */
10659
10660     char * endbrace;    /* '}' following the name */
10661     char* p;
10662     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10663                            stream */
10664     bool has_multiple_chars; /* true if the input stream contains a sequence of
10665                                 more than one character */
10666
10667     GET_RE_DEBUG_FLAGS_DECL;
10668
10669     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10670
10671     GET_RE_DEBUG_FLAGS;
10672
10673     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10674
10675     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10676      * modifier.  The other meaning does not, so use a temporary until we find
10677      * out which we are being called with */
10678     p = (RExC_flags & RXf_PMf_EXTENDED)
10679         ? regwhite( pRExC_state, RExC_parse )
10680         : RExC_parse;
10681
10682     /* Disambiguate between \N meaning a named character versus \N meaning
10683      * [^\n].  The former is assumed when it can't be the latter. */
10684     if (*p != '{' || regcurly(p, FALSE)) {
10685         RExC_parse = p;
10686         if (! node_p) {
10687             /* no bare \N allowed in a charclass */
10688             if (in_char_class) {
10689                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10690             }
10691             return FALSE;
10692         }
10693         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10694                            current char */
10695         nextchar(pRExC_state);
10696         *node_p = reg_node(pRExC_state, REG_ANY);
10697         *flagp |= HASWIDTH|SIMPLE;
10698         RExC_naughty++;
10699         Set_Node_Length(*node_p, 1); /* MJD */
10700         return TRUE;
10701     }
10702
10703     /* Here, we have decided it should be a named character or sequence */
10704
10705     /* The test above made sure that the next real character is a '{', but
10706      * under the /x modifier, it could be separated by space (or a comment and
10707      * \n) and this is not allowed (for consistency with \x{...} and the
10708      * tokenizer handling of \N{NAME}). */
10709     if (*RExC_parse != '{') {
10710         vFAIL("Missing braces on \\N{}");
10711     }
10712
10713     RExC_parse++;       /* Skip past the '{' */
10714
10715     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10716         || ! (endbrace == RExC_parse            /* nothing between the {} */
10717               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10718                                                  */
10719                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10720                                                      */
10721     {
10722         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10723         vFAIL("\\N{NAME} must be resolved by the lexer");
10724     }
10725
10726     if (endbrace == RExC_parse) {   /* empty: \N{} */
10727         bool ret = TRUE;
10728         if (node_p) {
10729             *node_p = reg_node(pRExC_state,NOTHING);
10730         }
10731         else if (in_char_class) {
10732             if (SIZE_ONLY && in_char_class) {
10733                 if (strict) {
10734                     RExC_parse++;   /* Position after the "}" */
10735                     vFAIL("Zero length \\N{}");
10736                 }
10737                 else {
10738                     ckWARNreg(RExC_parse,
10739                               "Ignoring zero length \\N{} in character class");
10740                 }
10741             }
10742             ret = FALSE;
10743         }
10744         else {
10745             return FALSE;
10746         }
10747         nextchar(pRExC_state);
10748         return ret;
10749     }
10750
10751     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10752     RExC_parse += 2;    /* Skip past the 'U+' */
10753
10754     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10755
10756     /* Code points are separated by dots.  If none, there is only one code
10757      * point, and is terminated by the brace */
10758     has_multiple_chars = (endchar < endbrace);
10759
10760     if (valuep && (! has_multiple_chars || in_char_class)) {
10761         /* We only pay attention to the first char of
10762         multichar strings being returned in char classes. I kinda wonder
10763         if this makes sense as it does change the behaviour
10764         from earlier versions, OTOH that behaviour was broken
10765         as well. XXX Solution is to recharacterize as
10766         [rest-of-class]|multi1|multi2... */
10767
10768         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10769         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10770             | PERL_SCAN_DISALLOW_PREFIX
10771             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10772
10773         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10774
10775         /* The tokenizer should have guaranteed validity, but it's possible to
10776          * bypass it by using single quoting, so check */
10777         if (length_of_hex == 0
10778             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10779         {
10780             RExC_parse += length_of_hex;        /* Includes all the valid */
10781             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10782                             ? UTF8SKIP(RExC_parse)
10783                             : 1;
10784             /* Guard against malformed utf8 */
10785             if (RExC_parse >= endchar) {
10786                 RExC_parse = endchar;
10787             }
10788             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10789         }
10790
10791         if (in_char_class && has_multiple_chars) {
10792             if (strict) {
10793                 RExC_parse = endbrace;
10794                 vFAIL("\\N{} in character class restricted to one character");
10795             }
10796             else {
10797                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10798             }
10799         }
10800
10801         RExC_parse = endbrace + 1;
10802     }
10803     else if (! node_p || ! has_multiple_chars) {
10804
10805         /* Here, the input is legal, but not according to the caller's
10806          * options.  We fail without advancing the parse, so that the
10807          * caller can try again */
10808         RExC_parse = p;
10809         return FALSE;
10810     }
10811     else {
10812
10813         /* What is done here is to convert this to a sub-pattern of the form
10814          * (?:\x{char1}\x{char2}...)
10815          * and then call reg recursively.  That way, it retains its atomicness,
10816          * while not having to worry about special handling that some code
10817          * points may have.  toke.c has converted the original Unicode values
10818          * to native, so that we can just pass on the hex values unchanged.  We
10819          * do have to set a flag to keep recoding from happening in the
10820          * recursion */
10821
10822         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10823         STRLEN len;
10824         char *orig_end = RExC_end;
10825         I32 flags;
10826
10827         while (RExC_parse < endbrace) {
10828
10829             /* Convert to notation the rest of the code understands */
10830             sv_catpv(substitute_parse, "\\x{");
10831             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10832             sv_catpv(substitute_parse, "}");
10833
10834             /* Point to the beginning of the next character in the sequence. */
10835             RExC_parse = endchar + 1;
10836             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10837         }
10838         sv_catpv(substitute_parse, ")");
10839
10840         RExC_parse = SvPV(substitute_parse, len);
10841
10842         /* Don't allow empty number */
10843         if (len < 8) {
10844             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10845         }
10846         RExC_end = RExC_parse + len;
10847
10848         /* The values are Unicode, and therefore not subject to recoding */
10849         RExC_override_recoding = 1;
10850
10851         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10852             if (flags & RESTART_UTF8) {
10853                 *flagp = RESTART_UTF8;
10854                 return FALSE;
10855             }
10856             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10857                   (UV) flags);
10858         }
10859         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10860
10861         RExC_parse = endbrace;
10862         RExC_end = orig_end;
10863         RExC_override_recoding = 0;
10864
10865         nextchar(pRExC_state);
10866     }
10867
10868     return TRUE;
10869 }
10870
10871
10872 /*
10873  * reg_recode
10874  *
10875  * It returns the code point in utf8 for the value in *encp.
10876  *    value: a code value in the source encoding
10877  *    encp:  a pointer to an Encode object
10878  *
10879  * If the result from Encode is not a single character,
10880  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10881  */
10882 STATIC UV
10883 S_reg_recode(pTHX_ const char value, SV **encp)
10884 {
10885     STRLEN numlen = 1;
10886     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10887     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10888     const STRLEN newlen = SvCUR(sv);
10889     UV uv = UNICODE_REPLACEMENT;
10890
10891     PERL_ARGS_ASSERT_REG_RECODE;
10892
10893     if (newlen)
10894         uv = SvUTF8(sv)
10895              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10896              : *(U8*)s;
10897
10898     if (!newlen || numlen != newlen) {
10899         uv = UNICODE_REPLACEMENT;
10900         *encp = NULL;
10901     }
10902     return uv;
10903 }
10904
10905 PERL_STATIC_INLINE U8
10906 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10907 {
10908     U8 op;
10909
10910     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10911
10912     if (! FOLD) {
10913         return EXACT;
10914     }
10915
10916     op = get_regex_charset(RExC_flags);
10917     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10918         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10919                  been, so there is no hole */
10920     }
10921
10922     return op + EXACTF;
10923 }
10924
10925 PERL_STATIC_INLINE void
10926 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10927                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10928                          bool downgradable)
10929 {
10930     /* This knows the details about sizing an EXACTish node, setting flags for
10931      * it (by setting <*flagp>, and potentially populating it with a single
10932      * character.
10933      *
10934      * If <len> (the length in bytes) is non-zero, this function assumes that
10935      * the node has already been populated, and just does the sizing.  In this
10936      * case <code_point> should be the final code point that has already been
10937      * placed into the node.  This value will be ignored except that under some
10938      * circumstances <*flagp> is set based on it.
10939      *
10940      * If <len> is zero, the function assumes that the node is to contain only
10941      * the single character given by <code_point> and calculates what <len>
10942      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10943      * additionally will populate the node's STRING with <code_point> or its
10944      * fold if folding.
10945      *
10946      * In both cases <*flagp> is appropriately set
10947      *
10948      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10949      * 255, must be folded (the former only when the rules indicate it can
10950      * match 'ss')
10951      *
10952      * When it does the populating, it looks at the flag 'downgradable'.  If
10953      * true with a node that folds, it checks if the single code point
10954      * participates in a fold, and if not downgrades the node to an EXACT.
10955      * This helps the optimizer */
10956
10957     bool len_passed_in = cBOOL(len != 0);
10958     U8 character[UTF8_MAXBYTES_CASE+1];
10959
10960     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10961
10962     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10963      * sizing difference, and is extra work that is thrown away */
10964     if (downgradable && ! PASS2) {
10965         downgradable = FALSE;
10966     }
10967
10968     if (! len_passed_in) {
10969         if (UTF) {
10970             if (UNI_IS_INVARIANT(code_point)) {
10971                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10972                     *character = (U8) code_point;
10973                 }
10974                 else { /* Here is /i and not /l (toFOLD() is defined on just
10975                           ASCII, which isn't the same thing as INVARIANT on
10976                           EBCDIC, but it works there, as the extra invariants
10977                           fold to themselves) */
10978                     *character = toFOLD((U8) code_point);
10979
10980                     /* We can downgrade to an EXACT node if this character
10981                      * isn't a folding one.  Note that this assumes that
10982                      * nothing above Latin1 folds to some other invariant than
10983                      * one of these alphabetics; otherwise we would also have
10984                      * to check:
10985                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10986                      *      || ASCII_FOLD_RESTRICTED))
10987                      */
10988                     if (downgradable && PL_fold[code_point] == code_point) {
10989                         OP(node) = EXACT;
10990                     }
10991                 }
10992                 len = 1;
10993             }
10994             else if (FOLD && (! LOC
10995                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10996             {   /* Folding, and ok to do so now */
10997                 UV folded = _to_uni_fold_flags(
10998                                    code_point,
10999                                    character,
11000                                    &len,
11001                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11002                                                       ? FOLD_FLAGS_NOMIX_ASCII
11003                                                       : 0));
11004                 if (downgradable
11005                     && folded == code_point
11006                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11007                 {
11008                     OP(node) = EXACT;
11009                 }
11010             }
11011             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11012
11013                 /* Not folding this cp, and can output it directly */
11014                 *character = UTF8_TWO_BYTE_HI(code_point);
11015                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11016                 len = 2;
11017             }
11018             else {
11019                 uvchr_to_utf8( character, code_point);
11020                 len = UTF8SKIP(character);
11021             }
11022         } /* Else pattern isn't UTF8.  */
11023         else if (! FOLD) {
11024             *character = (U8) code_point;
11025             len = 1;
11026         } /* Else is folded non-UTF8 */
11027         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11028
11029             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11030              * comments at join_exact()); */
11031             *character = (U8) code_point;
11032             len = 1;
11033
11034             /* Can turn into an EXACT node if we know the fold at compile time,
11035              * and it folds to itself and doesn't particpate in other folds */
11036             if (downgradable
11037                 && ! LOC
11038                 && PL_fold_latin1[code_point] == code_point
11039                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11040                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11041             {
11042                 OP(node) = EXACT;
11043             }
11044         } /* else is Sharp s.  May need to fold it */
11045         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11046             *character = 's';
11047             *(character + 1) = 's';
11048             len = 2;
11049         }
11050         else {
11051             *character = LATIN_SMALL_LETTER_SHARP_S;
11052             len = 1;
11053         }
11054     }
11055
11056     if (SIZE_ONLY) {
11057         RExC_size += STR_SZ(len);
11058     }
11059     else {
11060         RExC_emit += STR_SZ(len);
11061         STR_LEN(node) = len;
11062         if (! len_passed_in) {
11063             Copy((char *) character, STRING(node), len, char);
11064         }
11065     }
11066
11067     *flagp |= HASWIDTH;
11068
11069     /* A single character node is SIMPLE, except for the special-cased SHARP S
11070      * under /di. */
11071     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11072         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11073             || ! FOLD || ! DEPENDS_SEMANTICS))
11074     {
11075         *flagp |= SIMPLE;
11076     }
11077
11078     /* The OP may not be well defined in PASS1 */
11079     if (PASS2 && OP(node) == EXACTFL) {
11080         RExC_contains_locale = 1;
11081     }
11082 }
11083
11084
11085 /* return atoi(p), unless it's too big to sensibly be a backref,
11086  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11087
11088 static I32
11089 S_backref_value(char *p)
11090 {
11091     char *q = p;
11092
11093     for (;isDIGIT(*q); q++); /* calculate length of num */
11094     if (q - p == 0 || q - p > 9)
11095         return I32_MAX;
11096     return atoi(p);
11097 }
11098
11099
11100 /*
11101  - regatom - the lowest level
11102
11103    Try to identify anything special at the start of the pattern. If there
11104    is, then handle it as required. This may involve generating a single regop,
11105    such as for an assertion; or it may involve recursing, such as to
11106    handle a () structure.
11107
11108    If the string doesn't start with something special then we gobble up
11109    as much literal text as we can.
11110
11111    Once we have been able to handle whatever type of thing started the
11112    sequence, we return.
11113
11114    Note: we have to be careful with escapes, as they can be both literal
11115    and special, and in the case of \10 and friends, context determines which.
11116
11117    A summary of the code structure is:
11118
11119    switch (first_byte) {
11120         cases for each special:
11121             handle this special;
11122             break;
11123         case '\\':
11124             switch (2nd byte) {
11125                 cases for each unambiguous special:
11126                     handle this special;
11127                     break;
11128                 cases for each ambigous special/literal:
11129                     disambiguate;
11130                     if (special)  handle here
11131                     else goto defchar;
11132                 default: // unambiguously literal:
11133                     goto defchar;
11134             }
11135         default:  // is a literal char
11136             // FALL THROUGH
11137         defchar:
11138             create EXACTish node for literal;
11139             while (more input and node isn't full) {
11140                 switch (input_byte) {
11141                    cases for each special;
11142                        make sure parse pointer is set so that the next call to
11143                            regatom will see this special first
11144                        goto loopdone; // EXACTish node terminated by prev. char
11145                    default:
11146                        append char to EXACTISH node;
11147                 }
11148                 get next input byte;
11149             }
11150         loopdone:
11151    }
11152    return the generated node;
11153
11154    Specifically there are two separate switches for handling
11155    escape sequences, with the one for handling literal escapes requiring
11156    a dummy entry for all of the special escapes that are actually handled
11157    by the other.
11158
11159    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11160    TRYAGAIN.
11161    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11162    restarted.
11163    Otherwise does not return NULL.
11164 */
11165
11166 STATIC regnode *
11167 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11168 {
11169     dVAR;
11170     regnode *ret = NULL;
11171     I32 flags = 0;
11172     char *parse_start = RExC_parse;
11173     U8 op;
11174     int invert = 0;
11175
11176     GET_RE_DEBUG_FLAGS_DECL;
11177
11178     *flagp = WORST;             /* Tentatively. */
11179
11180     DEBUG_PARSE("atom");
11181
11182     PERL_ARGS_ASSERT_REGATOM;
11183
11184 tryagain:
11185     switch ((U8)*RExC_parse) {
11186     case '^':
11187         RExC_seen_zerolen++;
11188         nextchar(pRExC_state);
11189         if (RExC_flags & RXf_PMf_MULTILINE)
11190             ret = reg_node(pRExC_state, MBOL);
11191         else if (RExC_flags & RXf_PMf_SINGLELINE)
11192             ret = reg_node(pRExC_state, SBOL);
11193         else
11194             ret = reg_node(pRExC_state, BOL);
11195         Set_Node_Length(ret, 1); /* MJD */
11196         break;
11197     case '$':
11198         nextchar(pRExC_state);
11199         if (*RExC_parse)
11200             RExC_seen_zerolen++;
11201         if (RExC_flags & RXf_PMf_MULTILINE)
11202             ret = reg_node(pRExC_state, MEOL);
11203         else if (RExC_flags & RXf_PMf_SINGLELINE)
11204             ret = reg_node(pRExC_state, SEOL);
11205         else
11206             ret = reg_node(pRExC_state, EOL);
11207         Set_Node_Length(ret, 1); /* MJD */
11208         break;
11209     case '.':
11210         nextchar(pRExC_state);
11211         if (RExC_flags & RXf_PMf_SINGLELINE)
11212             ret = reg_node(pRExC_state, SANY);
11213         else
11214             ret = reg_node(pRExC_state, REG_ANY);
11215         *flagp |= HASWIDTH|SIMPLE;
11216         RExC_naughty++;
11217         Set_Node_Length(ret, 1); /* MJD */
11218         break;
11219     case '[':
11220     {
11221         char * const oregcomp_parse = ++RExC_parse;
11222         ret = regclass(pRExC_state, flagp,depth+1,
11223                        FALSE, /* means parse the whole char class */
11224                        TRUE, /* allow multi-char folds */
11225                        FALSE, /* don't silence non-portable warnings. */
11226                        NULL);
11227         if (*RExC_parse != ']') {
11228             RExC_parse = oregcomp_parse;
11229             vFAIL("Unmatched [");
11230         }
11231         if (ret == NULL) {
11232             if (*flagp & RESTART_UTF8)
11233                 return NULL;
11234             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11235                   (UV) *flagp);
11236         }
11237         nextchar(pRExC_state);
11238         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11239         break;
11240     }
11241     case '(':
11242         nextchar(pRExC_state);
11243         ret = reg(pRExC_state, 2, &flags,depth+1);
11244         if (ret == NULL) {
11245                 if (flags & TRYAGAIN) {
11246                     if (RExC_parse == RExC_end) {
11247                          /* Make parent create an empty node if needed. */
11248                         *flagp |= TRYAGAIN;
11249                         return(NULL);
11250                     }
11251                     goto tryagain;
11252                 }
11253                 if (flags & RESTART_UTF8) {
11254                     *flagp = RESTART_UTF8;
11255                     return NULL;
11256                 }
11257                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11258                                                                  (UV) flags);
11259         }
11260         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11261         break;
11262     case '|':
11263     case ')':
11264         if (flags & TRYAGAIN) {
11265             *flagp |= TRYAGAIN;
11266             return NULL;
11267         }
11268         vFAIL("Internal urp");
11269                                 /* Supposed to be caught earlier. */
11270         break;
11271     case '{':
11272         if (!regcurly(RExC_parse, FALSE)) {
11273             RExC_parse++;
11274             goto defchar;
11275         }
11276         /* FALL THROUGH */
11277     case '?':
11278     case '+':
11279     case '*':
11280         RExC_parse++;
11281         vFAIL("Quantifier follows nothing");
11282         break;
11283     case '\\':
11284         /* Special Escapes
11285
11286            This switch handles escape sequences that resolve to some kind
11287            of special regop and not to literal text. Escape sequnces that
11288            resolve to literal text are handled below in the switch marked
11289            "Literal Escapes".
11290
11291            Every entry in this switch *must* have a corresponding entry
11292            in the literal escape switch. However, the opposite is not
11293            required, as the default for this switch is to jump to the
11294            literal text handling code.
11295         */
11296         switch ((U8)*++RExC_parse) {
11297             U8 arg;
11298         /* Special Escapes */
11299         case 'A':
11300             RExC_seen_zerolen++;
11301             ret = reg_node(pRExC_state, SBOL);
11302             *flagp |= SIMPLE;
11303             goto finish_meta_pat;
11304         case 'G':
11305             ret = reg_node(pRExC_state, GPOS);
11306             RExC_seen |= REG_GPOS_SEEN;
11307             *flagp |= SIMPLE;
11308             goto finish_meta_pat;
11309         case 'K':
11310             RExC_seen_zerolen++;
11311             ret = reg_node(pRExC_state, KEEPS);
11312             *flagp |= SIMPLE;
11313             /* XXX:dmq : disabling in-place substitution seems to
11314              * be necessary here to avoid cases of memory corruption, as
11315              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11316              */
11317             RExC_seen |= REG_LOOKBEHIND_SEEN;
11318             goto finish_meta_pat;
11319         case 'Z':
11320             ret = reg_node(pRExC_state, SEOL);
11321             *flagp |= SIMPLE;
11322             RExC_seen_zerolen++;                /* Do not optimize RE away */
11323             goto finish_meta_pat;
11324         case 'z':
11325             ret = reg_node(pRExC_state, EOS);
11326             *flagp |= SIMPLE;
11327             RExC_seen_zerolen++;                /* Do not optimize RE away */
11328             goto finish_meta_pat;
11329         case 'C':
11330             ret = reg_node(pRExC_state, CANY);
11331             RExC_seen |= REG_CANY_SEEN;
11332             *flagp |= HASWIDTH|SIMPLE;
11333             goto finish_meta_pat;
11334         case 'X':
11335             ret = reg_node(pRExC_state, CLUMP);
11336             *flagp |= HASWIDTH;
11337             goto finish_meta_pat;
11338
11339         case 'W':
11340             invert = 1;
11341             /* FALLTHROUGH */
11342         case 'w':
11343             arg = ANYOF_WORDCHAR;
11344             goto join_posix;
11345
11346         case 'b':
11347             RExC_seen_zerolen++;
11348             RExC_seen |= REG_LOOKBEHIND_SEEN;
11349             op = BOUND + get_regex_charset(RExC_flags);
11350             if (op > BOUNDA) {  /* /aa is same as /a */
11351                 op = BOUNDA;
11352             }
11353             else if (op == BOUNDL) {
11354                 RExC_contains_locale = 1;
11355             }
11356             ret = reg_node(pRExC_state, op);
11357             FLAGS(ret) = get_regex_charset(RExC_flags);
11358             *flagp |= SIMPLE;
11359             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11360                 /* diag_listed_as: Use "%s" instead of "%s" */
11361                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11362             }
11363             goto finish_meta_pat;
11364         case 'B':
11365             RExC_seen_zerolen++;
11366             RExC_seen |= REG_LOOKBEHIND_SEEN;
11367             op = NBOUND + get_regex_charset(RExC_flags);
11368             if (op > NBOUNDA) { /* /aa is same as /a */
11369                 op = NBOUNDA;
11370             }
11371             else if (op == NBOUNDL) {
11372                 RExC_contains_locale = 1;
11373             }
11374             ret = reg_node(pRExC_state, op);
11375             FLAGS(ret) = get_regex_charset(RExC_flags);
11376             *flagp |= SIMPLE;
11377             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11378                 /* diag_listed_as: Use "%s" instead of "%s" */
11379                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11380             }
11381             goto finish_meta_pat;
11382
11383         case 'D':
11384             invert = 1;
11385             /* FALLTHROUGH */
11386         case 'd':
11387             arg = ANYOF_DIGIT;
11388             goto join_posix;
11389
11390         case 'R':
11391             ret = reg_node(pRExC_state, LNBREAK);
11392             *flagp |= HASWIDTH|SIMPLE;
11393             goto finish_meta_pat;
11394
11395         case 'H':
11396             invert = 1;
11397             /* FALLTHROUGH */
11398         case 'h':
11399             arg = ANYOF_BLANK;
11400             op = POSIXU;
11401             goto join_posix_op_known;
11402
11403         case 'V':
11404             invert = 1;
11405             /* FALLTHROUGH */
11406         case 'v':
11407             arg = ANYOF_VERTWS;
11408             op = POSIXU;
11409             goto join_posix_op_known;
11410
11411         case 'S':
11412             invert = 1;
11413             /* FALLTHROUGH */
11414         case 's':
11415             arg = ANYOF_SPACE;
11416
11417         join_posix:
11418
11419             op = POSIXD + get_regex_charset(RExC_flags);
11420             if (op > POSIXA) {  /* /aa is same as /a */
11421                 op = POSIXA;
11422             }
11423             else if (op == POSIXL) {
11424                 RExC_contains_locale = 1;
11425             }
11426
11427         join_posix_op_known:
11428
11429             if (invert) {
11430                 op += NPOSIXD - POSIXD;
11431             }
11432
11433             ret = reg_node(pRExC_state, op);
11434             if (! SIZE_ONLY) {
11435                 FLAGS(ret) = namedclass_to_classnum(arg);
11436             }
11437
11438             *flagp |= HASWIDTH|SIMPLE;
11439             /* FALL THROUGH */
11440
11441          finish_meta_pat:
11442             nextchar(pRExC_state);
11443             Set_Node_Length(ret, 2); /* MJD */
11444             break;
11445         case 'p':
11446         case 'P':
11447             {
11448 #ifdef DEBUGGING
11449                 char* parse_start = RExC_parse - 2;
11450 #endif
11451
11452                 RExC_parse--;
11453
11454                 ret = regclass(pRExC_state, flagp,depth+1,
11455                                TRUE, /* means just parse this element */
11456                                FALSE, /* don't allow multi-char folds */
11457                                FALSE, /* don't silence non-portable warnings.
11458                                          It would be a bug if these returned
11459                                          non-portables */
11460                                NULL);
11461                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11462                    are allowed.  */
11463                 if (!ret)
11464                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11465                           (UV) *flagp);
11466
11467                 RExC_parse--;
11468
11469                 Set_Node_Offset(ret, parse_start + 2);
11470                 Set_Node_Cur_Length(ret, parse_start);
11471                 nextchar(pRExC_state);
11472             }
11473             break;
11474         case 'N':
11475             /* Handle \N and \N{NAME} with multiple code points here and not
11476              * below because it can be multicharacter. join_exact() will join
11477              * them up later on.  Also this makes sure that things like
11478              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11479              * The options to the grok function call causes it to fail if the
11480              * sequence is just a single code point.  We then go treat it as
11481              * just another character in the current EXACT node, and hence it
11482              * gets uniform treatment with all the other characters.  The
11483              * special treatment for quantifiers is not needed for such single
11484              * character sequences */
11485             ++RExC_parse;
11486             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11487                                 FALSE /* not strict */ )) {
11488                 if (*flagp & RESTART_UTF8)
11489                     return NULL;
11490                 RExC_parse--;
11491                 goto defchar;
11492             }
11493             break;
11494         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11495         parse_named_seq:
11496         {
11497             char ch= RExC_parse[1];
11498             if (ch != '<' && ch != '\'' && ch != '{') {
11499                 RExC_parse++;
11500                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11501                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11502             } else {
11503                 /* this pretty much dupes the code for (?P=...) in reg(), if
11504                    you change this make sure you change that */
11505                 char* name_start = (RExC_parse += 2);
11506                 U32 num = 0;
11507                 SV *sv_dat = reg_scan_name(pRExC_state,
11508                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11509                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11510                 if (RExC_parse == name_start || *RExC_parse != ch)
11511                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11512                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11513
11514                 if (!SIZE_ONLY) {
11515                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11516                     RExC_rxi->data->data[num]=(void*)sv_dat;
11517                     SvREFCNT_inc_simple_void(sv_dat);
11518                 }
11519
11520                 RExC_sawback = 1;
11521                 ret = reganode(pRExC_state,
11522                                ((! FOLD)
11523                                  ? NREF
11524                                  : (ASCII_FOLD_RESTRICTED)
11525                                    ? NREFFA
11526                                    : (AT_LEAST_UNI_SEMANTICS)
11527                                      ? NREFFU
11528                                      : (LOC)
11529                                        ? NREFFL
11530                                        : NREFF),
11531                                 num);
11532                 *flagp |= HASWIDTH;
11533
11534                 /* override incorrect value set in reganode MJD */
11535                 Set_Node_Offset(ret, parse_start+1);
11536                 Set_Node_Cur_Length(ret, parse_start);
11537                 nextchar(pRExC_state);
11538
11539             }
11540             break;
11541         }
11542         case 'g':
11543         case '1': case '2': case '3': case '4':
11544         case '5': case '6': case '7': case '8': case '9':
11545             {
11546                 I32 num;
11547                 bool hasbrace = 0;
11548
11549                 if (*RExC_parse == 'g') {
11550                     bool isrel = 0;
11551
11552                     RExC_parse++;
11553                     if (*RExC_parse == '{') {
11554                         RExC_parse++;
11555                         hasbrace = 1;
11556                     }
11557                     if (*RExC_parse == '-') {
11558                         RExC_parse++;
11559                         isrel = 1;
11560                     }
11561                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11562                         if (isrel) RExC_parse--;
11563                         RExC_parse -= 2;
11564                         goto parse_named_seq;
11565                     }
11566
11567                     num = S_backref_value(RExC_parse);
11568                     if (num == 0)
11569                         vFAIL("Reference to invalid group 0");
11570                     else if (num == I32_MAX) {
11571                          if (isDIGIT(*RExC_parse))
11572                             vFAIL("Reference to nonexistent group");
11573                         else
11574                             vFAIL("Unterminated \\g... pattern");
11575                     }
11576
11577                     if (isrel) {
11578                         num = RExC_npar - num;
11579                         if (num < 1)
11580                             vFAIL("Reference to nonexistent or unclosed group");
11581                     }
11582                 }
11583                 else {
11584                     num = S_backref_value(RExC_parse);
11585                     /* bare \NNN might be backref or octal - if it is larger than or equal
11586                      * RExC_npar then it is assumed to be and octal escape.
11587                      * Note RExC_npar is +1 from the actual number of parens*/
11588                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11589                             && *RExC_parse != '8' && *RExC_parse != '9'))
11590                     {
11591                         /* Probably a character specified in octal, e.g. \35 */
11592                         goto defchar;
11593                     }
11594                 }
11595
11596                 /* at this point RExC_parse definitely points to a backref
11597                  * number */
11598                 {
11599 #ifdef RE_TRACK_PATTERN_OFFSETS
11600                     char * const parse_start = RExC_parse - 1; /* MJD */
11601 #endif
11602                     while (isDIGIT(*RExC_parse))
11603                         RExC_parse++;
11604                     if (hasbrace) {
11605                         if (*RExC_parse != '}')
11606                             vFAIL("Unterminated \\g{...} pattern");
11607                         RExC_parse++;
11608                     }
11609                     if (!SIZE_ONLY) {
11610                         if (num > (I32)RExC_rx->nparens)
11611                             vFAIL("Reference to nonexistent group");
11612                     }
11613                     RExC_sawback = 1;
11614                     ret = reganode(pRExC_state,
11615                                    ((! FOLD)
11616                                      ? REF
11617                                      : (ASCII_FOLD_RESTRICTED)
11618                                        ? REFFA
11619                                        : (AT_LEAST_UNI_SEMANTICS)
11620                                          ? REFFU
11621                                          : (LOC)
11622                                            ? REFFL
11623                                            : REFF),
11624                                     num);
11625                     *flagp |= HASWIDTH;
11626
11627                     /* override incorrect value set in reganode MJD */
11628                     Set_Node_Offset(ret, parse_start+1);
11629                     Set_Node_Cur_Length(ret, parse_start);
11630                     RExC_parse--;
11631                     nextchar(pRExC_state);
11632                 }
11633             }
11634             break;
11635         case '\0':
11636             if (RExC_parse >= RExC_end)
11637                 FAIL("Trailing \\");
11638             /* FALL THROUGH */
11639         default:
11640             /* Do not generate "unrecognized" warnings here, we fall
11641                back into the quick-grab loop below */
11642             parse_start--;
11643             goto defchar;
11644         }
11645         break;
11646
11647     case '#':
11648         if (RExC_flags & RXf_PMf_EXTENDED) {
11649             if ( reg_skipcomment( pRExC_state ) )
11650                 goto tryagain;
11651         }
11652         /* FALL THROUGH */
11653
11654     default:
11655
11656             parse_start = RExC_parse - 1;
11657
11658             RExC_parse++;
11659
11660         defchar: {
11661             STRLEN len = 0;
11662             UV ender = 0;
11663             char *p;
11664             char *s;
11665 #define MAX_NODE_STRING_SIZE 127
11666             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11667             char *s0;
11668             U8 upper_parse = MAX_NODE_STRING_SIZE;
11669             U8 node_type = compute_EXACTish(pRExC_state);
11670             bool next_is_quantifier;
11671             char * oldp = NULL;
11672
11673             /* We can convert EXACTF nodes to EXACTFU if they contain only
11674              * characters that match identically regardless of the target
11675              * string's UTF8ness.  The reason to do this is that EXACTF is not
11676              * trie-able, EXACTFU is.
11677              *
11678              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11679              * contain only above-Latin1 characters (hence must be in UTF8),
11680              * which don't participate in folds with Latin1-range characters,
11681              * as the latter's folds aren't known until runtime.  (We don't
11682              * need to figure this out until pass 2) */
11683             bool maybe_exactfu = PASS2
11684                                && (node_type == EXACTF || node_type == EXACTFL);
11685
11686             /* If a folding node contains only code points that don't
11687              * participate in folds, it can be changed into an EXACT node,
11688              * which allows the optimizer more things to look for */
11689             bool maybe_exact;
11690
11691             ret = reg_node(pRExC_state, node_type);
11692
11693             /* In pass1, folded, we use a temporary buffer instead of the
11694              * actual node, as the node doesn't exist yet */
11695             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11696
11697             s0 = s;
11698
11699         reparse:
11700
11701             /* We do the EXACTFish to EXACT node only if folding.  (And we
11702              * don't need to figure this out until pass 2) */
11703             maybe_exact = FOLD && PASS2;
11704
11705             /* XXX The node can hold up to 255 bytes, yet this only goes to
11706              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11707              * 255 allows us to not have to worry about overflow due to
11708              * converting to utf8 and fold expansion, but that value is
11709              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11710              * split up by this limit into a single one using the real max of
11711              * 255.  Even at 127, this breaks under rare circumstances.  If
11712              * folding, we do not want to split a node at a character that is a
11713              * non-final in a multi-char fold, as an input string could just
11714              * happen to want to match across the node boundary.  The join
11715              * would solve that problem if the join actually happens.  But a
11716              * series of more than two nodes in a row each of 127 would cause
11717              * the first join to succeed to get to 254, but then there wouldn't
11718              * be room for the next one, which could at be one of those split
11719              * multi-char folds.  I don't know of any fool-proof solution.  One
11720              * could back off to end with only a code point that isn't such a
11721              * non-final, but it is possible for there not to be any in the
11722              * entire node. */
11723             for (p = RExC_parse - 1;
11724                  len < upper_parse && p < RExC_end;
11725                  len++)
11726             {
11727                 oldp = p;
11728
11729                 if (RExC_flags & RXf_PMf_EXTENDED)
11730                     p = regwhite( pRExC_state, p );
11731                 switch ((U8)*p) {
11732                 case '^':
11733                 case '$':
11734                 case '.':
11735                 case '[':
11736                 case '(':
11737                 case ')':
11738                 case '|':
11739                     goto loopdone;
11740                 case '\\':
11741                     /* Literal Escapes Switch
11742
11743                        This switch is meant to handle escape sequences that
11744                        resolve to a literal character.
11745
11746                        Every escape sequence that represents something
11747                        else, like an assertion or a char class, is handled
11748                        in the switch marked 'Special Escapes' above in this
11749                        routine, but also has an entry here as anything that
11750                        isn't explicitly mentioned here will be treated as
11751                        an unescaped equivalent literal.
11752                     */
11753
11754                     switch ((U8)*++p) {
11755                     /* These are all the special escapes. */
11756                     case 'A':             /* Start assertion */
11757                     case 'b': case 'B':   /* Word-boundary assertion*/
11758                     case 'C':             /* Single char !DANGEROUS! */
11759                     case 'd': case 'D':   /* digit class */
11760                     case 'g': case 'G':   /* generic-backref, pos assertion */
11761                     case 'h': case 'H':   /* HORIZWS */
11762                     case 'k': case 'K':   /* named backref, keep marker */
11763                     case 'p': case 'P':   /* Unicode property */
11764                               case 'R':   /* LNBREAK */
11765                     case 's': case 'S':   /* space class */
11766                     case 'v': case 'V':   /* VERTWS */
11767                     case 'w': case 'W':   /* word class */
11768                     case 'X':             /* eXtended Unicode "combining
11769                                              character sequence" */
11770                     case 'z': case 'Z':   /* End of line/string assertion */
11771                         --p;
11772                         goto loopdone;
11773
11774                     /* Anything after here is an escape that resolves to a
11775                        literal. (Except digits, which may or may not)
11776                      */
11777                     case 'n':
11778                         ender = '\n';
11779                         p++;
11780                         break;
11781                     case 'N': /* Handle a single-code point named character. */
11782                         /* The options cause it to fail if a multiple code
11783                          * point sequence.  Handle those in the switch() above
11784                          * */
11785                         RExC_parse = p + 1;
11786                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11787                                             flagp, depth, FALSE,
11788                                             FALSE /* not strict */ ))
11789                         {
11790                             if (*flagp & RESTART_UTF8)
11791                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11792                             RExC_parse = p = oldp;
11793                             goto loopdone;
11794                         }
11795                         p = RExC_parse;
11796                         if (ender > 0xff) {
11797                             REQUIRE_UTF8;
11798                         }
11799                         break;
11800                     case 'r':
11801                         ender = '\r';
11802                         p++;
11803                         break;
11804                     case 't':
11805                         ender = '\t';
11806                         p++;
11807                         break;
11808                     case 'f':
11809                         ender = '\f';
11810                         p++;
11811                         break;
11812                     case 'e':
11813                           ender = ASCII_TO_NATIVE('\033');
11814                         p++;
11815                         break;
11816                     case 'a':
11817                           ender = '\a';
11818                         p++;
11819                         break;
11820                     case 'o':
11821                         {
11822                             UV result;
11823                             const char* error_msg;
11824
11825                             bool valid = grok_bslash_o(&p,
11826                                                        &result,
11827                                                        &error_msg,
11828                                                        TRUE, /* out warnings */
11829                                                        FALSE, /* not strict */
11830                                                        TRUE, /* Output warnings
11831                                                                 for non-
11832                                                                 portables */
11833                                                        UTF);
11834                             if (! valid) {
11835                                 RExC_parse = p; /* going to die anyway; point
11836                                                    to exact spot of failure */
11837                                 vFAIL(error_msg);
11838                             }
11839                             ender = result;
11840                             if (PL_encoding && ender < 0x100) {
11841                                 goto recode_encoding;
11842                             }
11843                             if (ender > 0xff) {
11844                                 REQUIRE_UTF8;
11845                             }
11846                             break;
11847                         }
11848                     case 'x':
11849                         {
11850                             UV result = UV_MAX; /* initialize to erroneous
11851                                                    value */
11852                             const char* error_msg;
11853
11854                             bool valid = grok_bslash_x(&p,
11855                                                        &result,
11856                                                        &error_msg,
11857                                                        TRUE, /* out warnings */
11858                                                        FALSE, /* not strict */
11859                                                        TRUE, /* Output warnings
11860                                                                 for non-
11861                                                                 portables */
11862                                                        UTF);
11863                             if (! valid) {
11864                                 RExC_parse = p; /* going to die anyway; point
11865                                                    to exact spot of failure */
11866                                 vFAIL(error_msg);
11867                             }
11868                             ender = result;
11869
11870                             if (PL_encoding && ender < 0x100) {
11871                                 goto recode_encoding;
11872                             }
11873                             if (ender > 0xff) {
11874                                 REQUIRE_UTF8;
11875                             }
11876                             break;
11877                         }
11878                     case 'c':
11879                         p++;
11880                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11881                         break;
11882                     case '8': case '9': /* must be a backreference */
11883                         --p;
11884                         goto loopdone;
11885                     case '1': case '2': case '3':case '4':
11886                     case '5': case '6': case '7':
11887                         /* When we parse backslash escapes there is ambiguity
11888                          * between backreferences and octal escapes. Any escape
11889                          * from \1 - \9 is a backreference, any multi-digit
11890                          * escape which does not start with 0 and which when
11891                          * evaluated as decimal could refer to an already
11892                          * parsed capture buffer is a backslash. Anything else
11893                          * is octal.
11894                          *
11895                          * Note this implies that \118 could be interpreted as
11896                          * 118 OR as "\11" . "8" depending on whether there
11897                          * were 118 capture buffers defined already in the
11898                          * pattern.  */
11899
11900                         /* NOTE, RExC_npar is 1 more than the actual number of
11901                          * parens we have seen so far, hence the < RExC_npar below. */
11902
11903                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11904                         {  /* Not to be treated as an octal constant, go
11905                                    find backref */
11906                             --p;
11907                             goto loopdone;
11908                         }
11909                     case '0':
11910                         {
11911                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11912                             STRLEN numlen = 3;
11913                             ender = grok_oct(p, &numlen, &flags, NULL);
11914                             if (ender > 0xff) {
11915                                 REQUIRE_UTF8;
11916                             }
11917                             p += numlen;
11918                             if (SIZE_ONLY   /* like \08, \178 */
11919                                 && numlen < 3
11920                                 && p < RExC_end
11921                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11922                             {
11923                                 reg_warn_non_literal_string(
11924                                          p + 1,
11925                                          form_short_octal_warning(p, numlen));
11926                             }
11927                         }
11928                         if (PL_encoding && ender < 0x100)
11929                             goto recode_encoding;
11930                         break;
11931                     recode_encoding:
11932                         if (! RExC_override_recoding) {
11933                             SV* enc = PL_encoding;
11934                             ender = reg_recode((const char)(U8)ender, &enc);
11935                             if (!enc && SIZE_ONLY)
11936                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11937                             REQUIRE_UTF8;
11938                         }
11939                         break;
11940                     case '\0':
11941                         if (p >= RExC_end)
11942                             FAIL("Trailing \\");
11943                         /* FALL THROUGH */
11944                     default:
11945                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11946                             /* Include any { following the alpha to emphasize
11947                              * that it could be part of an escape at some point
11948                              * in the future */
11949                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11950                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11951                         }
11952                         goto normal_default;
11953                     } /* End of switch on '\' */
11954                     break;
11955                 default:    /* A literal character */
11956
11957                     if (! SIZE_ONLY
11958                         && RExC_flags & RXf_PMf_EXTENDED
11959                         && ckWARN_d(WARN_DEPRECATED)
11960                         && is_PATWS_non_low_safe(p, RExC_end, UTF))
11961                     {
11962                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11963                                 "Escape literal pattern white space under /x");
11964                     }
11965
11966                   normal_default:
11967                     if (UTF8_IS_START(*p) && UTF) {
11968                         STRLEN numlen;
11969                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11970                                                &numlen, UTF8_ALLOW_DEFAULT);
11971                         p += numlen;
11972                     }
11973                     else
11974                         ender = (U8) *p++;
11975                     break;
11976                 } /* End of switch on the literal */
11977
11978                 /* Here, have looked at the literal character and <ender>
11979                  * contains its ordinal, <p> points to the character after it
11980                  */
11981
11982                 if ( RExC_flags & RXf_PMf_EXTENDED)
11983                     p = regwhite( pRExC_state, p );
11984
11985                 /* If the next thing is a quantifier, it applies to this
11986                  * character only, which means that this character has to be in
11987                  * its own node and can't just be appended to the string in an
11988                  * existing node, so if there are already other characters in
11989                  * the node, close the node with just them, and set up to do
11990                  * this character again next time through, when it will be the
11991                  * only thing in its new node */
11992                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11993                 {
11994                     p = oldp;
11995                     goto loopdone;
11996                 }
11997
11998                 if (! FOLD   /* The simple case, just append the literal */
11999                     || (LOC  /* Also don't fold for tricky chars under /l */
12000                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12001                 {
12002                     if (UTF) {
12003                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12004                         if (unilen > 0) {
12005                            s   += unilen;
12006                            len += unilen;
12007                         }
12008
12009                         /* The loop increments <len> each time, as all but this
12010                          * path (and one other) through it add a single byte to
12011                          * the EXACTish node.  But this one has changed len to
12012                          * be the correct final value, so subtract one to
12013                          * cancel out the increment that follows */
12014                         len--;
12015                     }
12016                     else {
12017                         REGC((char)ender, s++);
12018                     }
12019
12020                     /* Can get here if folding only if is one of the /l
12021                      * characters whose fold depends on the locale.  The
12022                      * occurrence of any of these indicate that we can't
12023                      * simplify things */
12024                     if (FOLD) {
12025                         maybe_exact = FALSE;
12026                         maybe_exactfu = FALSE;
12027                     }
12028                 }
12029                 else             /* FOLD */
12030                      if (! ( UTF
12031                         /* See comments for join_exact() as to why we fold this
12032                          * non-UTF at compile time */
12033                         || (node_type == EXACTFU
12034                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12035                 {
12036                     /* Here, are folding and are not UTF-8 encoded; therefore
12037                      * the character must be in the range 0-255, and is not /l
12038                      * (Not /l because we already handled these under /l in
12039                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12040                     if (IS_IN_SOME_FOLD_L1(ender)) {
12041                         maybe_exact = FALSE;
12042
12043                         /* See if the character's fold differs between /d and
12044                          * /u.  This includes the multi-char fold SHARP S to
12045                          * 'ss' */
12046                         if (maybe_exactfu
12047                             && (PL_fold[ender] != PL_fold_latin1[ender]
12048                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12049                                 || (len > 0
12050                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12051                                    && isARG2_lower_or_UPPER_ARG1('s',
12052                                                                  *(s-1)))))
12053                         {
12054                             maybe_exactfu = FALSE;
12055                         }
12056                     }
12057
12058                     /* Even when folding, we store just the input character, as
12059                      * we have an array that finds its fold quickly */
12060                     *(s++) = (char) ender;
12061                 }
12062                 else {  /* FOLD and UTF */
12063                     /* Unlike the non-fold case, we do actually have to
12064                      * calculate the results here in pass 1.  This is for two
12065                      * reasons, the folded length may be longer than the
12066                      * unfolded, and we have to calculate how many EXACTish
12067                      * nodes it will take; and we may run out of room in a node
12068                      * in the middle of a potential multi-char fold, and have
12069                      * to back off accordingly.  (Hence we can't use REGC for
12070                      * the simple case just below.) */
12071
12072                     UV folded;
12073                     if (isASCII(ender)) {
12074                         folded = toFOLD(ender);
12075                         *(s)++ = (U8) folded;
12076                     }
12077                     else {
12078                         STRLEN foldlen;
12079
12080                         folded = _to_uni_fold_flags(
12081                                      ender,
12082                                      (U8 *) s,
12083                                      &foldlen,
12084                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12085                                                         ? FOLD_FLAGS_NOMIX_ASCII
12086                                                         : 0));
12087                         s += foldlen;
12088
12089                         /* The loop increments <len> each time, as all but this
12090                          * path (and one other) through it add a single byte to
12091                          * the EXACTish node.  But this one has changed len to
12092                          * be the correct final value, so subtract one to
12093                          * cancel out the increment that follows */
12094                         len += foldlen - 1;
12095                     }
12096                     /* If this node only contains non-folding code points so
12097                      * far, see if this new one is also non-folding */
12098                     if (maybe_exact) {
12099                         if (folded != ender) {
12100                             maybe_exact = FALSE;
12101                         }
12102                         else {
12103                             /* Here the fold is the original; we have to check
12104                              * further to see if anything folds to it */
12105                             if (_invlist_contains_cp(PL_utf8_foldable,
12106                                                         ender))
12107                             {
12108                                 maybe_exact = FALSE;
12109                             }
12110                         }
12111                     }
12112                     ender = folded;
12113                 }
12114
12115                 if (next_is_quantifier) {
12116
12117                     /* Here, the next input is a quantifier, and to get here,
12118                      * the current character is the only one in the node.
12119                      * Also, here <len> doesn't include the final byte for this
12120                      * character */
12121                     len++;
12122                     goto loopdone;
12123                 }
12124
12125             } /* End of loop through literal characters */
12126
12127             /* Here we have either exhausted the input or ran out of room in
12128              * the node.  (If we encountered a character that can't be in the
12129              * node, transfer is made directly to <loopdone>, and so we
12130              * wouldn't have fallen off the end of the loop.)  In the latter
12131              * case, we artificially have to split the node into two, because
12132              * we just don't have enough space to hold everything.  This
12133              * creates a problem if the final character participates in a
12134              * multi-character fold in the non-final position, as a match that
12135              * should have occurred won't, due to the way nodes are matched,
12136              * and our artificial boundary.  So back off until we find a non-
12137              * problematic character -- one that isn't at the beginning or
12138              * middle of such a fold.  (Either it doesn't participate in any
12139              * folds, or appears only in the final position of all the folds it
12140              * does participate in.)  A better solution with far fewer false
12141              * positives, and that would fill the nodes more completely, would
12142              * be to actually have available all the multi-character folds to
12143              * test against, and to back-off only far enough to be sure that
12144              * this node isn't ending with a partial one.  <upper_parse> is set
12145              * further below (if we need to reparse the node) to include just
12146              * up through that final non-problematic character that this code
12147              * identifies, so when it is set to less than the full node, we can
12148              * skip the rest of this */
12149             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12150
12151                 const STRLEN full_len = len;
12152
12153                 assert(len >= MAX_NODE_STRING_SIZE);
12154
12155                 /* Here, <s> points to the final byte of the final character.
12156                  * Look backwards through the string until find a non-
12157                  * problematic character */
12158
12159                 if (! UTF) {
12160
12161                     /* This has no multi-char folds to non-UTF characters */
12162                     if (ASCII_FOLD_RESTRICTED) {
12163                         goto loopdone;
12164                     }
12165
12166                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12167                     len = s - s0 + 1;
12168                 }
12169                 else {
12170                     if (!  PL_NonL1NonFinalFold) {
12171                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12172                                         NonL1_Perl_Non_Final_Folds_invlist);
12173                     }
12174
12175                     /* Point to the first byte of the final character */
12176                     s = (char *) utf8_hop((U8 *) s, -1);
12177
12178                     while (s >= s0) {   /* Search backwards until find
12179                                            non-problematic char */
12180                         if (UTF8_IS_INVARIANT(*s)) {
12181
12182                             /* There are no ascii characters that participate
12183                              * in multi-char folds under /aa.  In EBCDIC, the
12184                              * non-ascii invariants are all control characters,
12185                              * so don't ever participate in any folds. */
12186                             if (ASCII_FOLD_RESTRICTED
12187                                 || ! IS_NON_FINAL_FOLD(*s))
12188                             {
12189                                 break;
12190                             }
12191                         }
12192                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12193                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12194                                                                   *s, *(s+1))))
12195                             {
12196                                 break;
12197                             }
12198                         }
12199                         else if (! _invlist_contains_cp(
12200                                         PL_NonL1NonFinalFold,
12201                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12202                         {
12203                             break;
12204                         }
12205
12206                         /* Here, the current character is problematic in that
12207                          * it does occur in the non-final position of some
12208                          * fold, so try the character before it, but have to
12209                          * special case the very first byte in the string, so
12210                          * we don't read outside the string */
12211                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12212                     } /* End of loop backwards through the string */
12213
12214                     /* If there were only problematic characters in the string,
12215                      * <s> will point to before s0, in which case the length
12216                      * should be 0, otherwise include the length of the
12217                      * non-problematic character just found */
12218                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12219                 }
12220
12221                 /* Here, have found the final character, if any, that is
12222                  * non-problematic as far as ending the node without splitting
12223                  * it across a potential multi-char fold.  <len> contains the
12224                  * number of bytes in the node up-to and including that
12225                  * character, or is 0 if there is no such character, meaning
12226                  * the whole node contains only problematic characters.  In
12227                  * this case, give up and just take the node as-is.  We can't
12228                  * do any better */
12229                 if (len == 0) {
12230                     len = full_len;
12231
12232                     /* If the node ends in an 's' we make sure it stays EXACTF,
12233                      * as if it turns into an EXACTFU, it could later get
12234                      * joined with another 's' that would then wrongly match
12235                      * the sharp s */
12236                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12237                     {
12238                         maybe_exactfu = FALSE;
12239                     }
12240                 } else {
12241
12242                     /* Here, the node does contain some characters that aren't
12243                      * problematic.  If one such is the final character in the
12244                      * node, we are done */
12245                     if (len == full_len) {
12246                         goto loopdone;
12247                     }
12248                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12249
12250                         /* If the final character is problematic, but the
12251                          * penultimate is not, back-off that last character to
12252                          * later start a new node with it */
12253                         p = oldp;
12254                         goto loopdone;
12255                     }
12256
12257                     /* Here, the final non-problematic character is earlier
12258                      * in the input than the penultimate character.  What we do
12259                      * is reparse from the beginning, going up only as far as
12260                      * this final ok one, thus guaranteeing that the node ends
12261                      * in an acceptable character.  The reason we reparse is
12262                      * that we know how far in the character is, but we don't
12263                      * know how to correlate its position with the input parse.
12264                      * An alternate implementation would be to build that
12265                      * correlation as we go along during the original parse,
12266                      * but that would entail extra work for every node, whereas
12267                      * this code gets executed only when the string is too
12268                      * large for the node, and the final two characters are
12269                      * problematic, an infrequent occurrence.  Yet another
12270                      * possible strategy would be to save the tail of the
12271                      * string, and the next time regatom is called, initialize
12272                      * with that.  The problem with this is that unless you
12273                      * back off one more character, you won't be guaranteed
12274                      * regatom will get called again, unless regbranch,
12275                      * regpiece ... are also changed.  If you do back off that
12276                      * extra character, so that there is input guaranteed to
12277                      * force calling regatom, you can't handle the case where
12278                      * just the first character in the node is acceptable.  I
12279                      * (khw) decided to try this method which doesn't have that
12280                      * pitfall; if performance issues are found, we can do a
12281                      * combination of the current approach plus that one */
12282                     upper_parse = len;
12283                     len = 0;
12284                     s = s0;
12285                     goto reparse;
12286                 }
12287             }   /* End of verifying node ends with an appropriate char */
12288
12289         loopdone:   /* Jumped to when encounters something that shouldn't be in
12290                        the node */
12291
12292             /* I (khw) don't know if you can get here with zero length, but the
12293              * old code handled this situation by creating a zero-length EXACT
12294              * node.  Might as well be NOTHING instead */
12295             if (len == 0) {
12296                 OP(ret) = NOTHING;
12297             }
12298             else {
12299                 if (FOLD) {
12300                     /* If 'maybe_exact' is still set here, means there are no
12301                      * code points in the node that participate in folds;
12302                      * similarly for 'maybe_exactfu' and code points that match
12303                      * differently depending on UTF8ness of the target string
12304                      * (for /u), or depending on locale for /l */
12305                     if (maybe_exact) {
12306                         OP(ret) = EXACT;
12307                     }
12308                     else if (maybe_exactfu) {
12309                         OP(ret) = EXACTFU;
12310                     }
12311                 }
12312                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12313                                            FALSE /* Don't look to see if could
12314                                                     be turned into an EXACT
12315                                                     node, as we have already
12316                                                     computed that */
12317                                           );
12318             }
12319
12320             RExC_parse = p - 1;
12321             Set_Node_Cur_Length(ret, parse_start);
12322             nextchar(pRExC_state);
12323             {
12324                 /* len is STRLEN which is unsigned, need to copy to signed */
12325                 IV iv = len;
12326                 if (iv < 0)
12327                     vFAIL("Internal disaster");
12328             }
12329
12330         } /* End of label 'defchar:' */
12331         break;
12332     } /* End of giant switch on input character */
12333
12334     return(ret);
12335 }
12336
12337 STATIC char *
12338 S_regwhite( RExC_state_t *pRExC_state, char *p )
12339 {
12340     const char *e = RExC_end;
12341
12342     PERL_ARGS_ASSERT_REGWHITE;
12343
12344     while (p < e) {
12345         if (isSPACE(*p))
12346             ++p;
12347         else if (*p == '#') {
12348             bool ended = 0;
12349             do {
12350                 if (*p++ == '\n') {
12351                     ended = 1;
12352                     break;
12353                 }
12354             } while (p < e);
12355             if (!ended)
12356                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12357         }
12358         else
12359             break;
12360     }
12361     return p;
12362 }
12363
12364 STATIC char *
12365 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12366 {
12367     /* Returns the next non-pattern-white space, non-comment character (the
12368      * latter only if 'recognize_comment is true) in the string p, which is
12369      * ended by RExC_end.  If there is no line break ending a comment,
12370      * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12371     const char *e = RExC_end;
12372
12373     PERL_ARGS_ASSERT_REGPATWS;
12374
12375     while (p < e) {
12376         STRLEN len;
12377         if ((len = is_PATWS_safe(p, e, UTF))) {
12378             p += len;
12379         }
12380         else if (recognize_comment && *p == '#') {
12381             bool ended = 0;
12382             do {
12383                 p++;
12384                 if (is_LNBREAK_safe(p, e, UTF)) {
12385                     ended = 1;
12386                     break;
12387                 }
12388             } while (p < e);
12389             if (!ended)
12390                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12391         }
12392         else
12393             break;
12394     }
12395     return p;
12396 }
12397
12398 STATIC void
12399 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12400 {
12401     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12402      * sets up the bitmap and any flags, removing those code points from the
12403      * inversion list, setting it to NULL should it become completely empty */
12404
12405     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12406     assert(PL_regkind[OP(node)] == ANYOF);
12407
12408     ANYOF_BITMAP_ZERO(node);
12409     if (*invlist_ptr) {
12410
12411         /* This gets set if we actually need to modify things */
12412         bool change_invlist = FALSE;
12413
12414         UV start, end;
12415
12416         /* Start looking through *invlist_ptr */
12417         invlist_iterinit(*invlist_ptr);
12418         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12419             UV high;
12420             int i;
12421
12422             if (end == UV_MAX && start <= 256) {
12423                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12424             }
12425             else if (end >= 256) {
12426                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12427             }
12428
12429             /* Quit if are above what we should change */
12430             if (start > 255) {
12431                 break;
12432             }
12433
12434             change_invlist = TRUE;
12435
12436             /* Set all the bits in the range, up to the max that we are doing */
12437             high = (end < 255) ? end : 255;
12438             for (i = start; i <= (int) high; i++) {
12439                 if (! ANYOF_BITMAP_TEST(node, i)) {
12440                     ANYOF_BITMAP_SET(node, i);
12441                 }
12442             }
12443         }
12444         invlist_iterfinish(*invlist_ptr);
12445
12446         /* Done with loop; remove any code points that are in the bitmap from
12447          * *invlist_ptr; similarly for code points above latin1 if we have a
12448          * flag to match all of them anyways */
12449         if (change_invlist) {
12450             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12451         }
12452         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12453             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12454         }
12455
12456         /* If have completely emptied it, remove it completely */
12457         if (_invlist_len(*invlist_ptr) == 0) {
12458             SvREFCNT_dec_NN(*invlist_ptr);
12459             *invlist_ptr = NULL;
12460         }
12461     }
12462 }
12463
12464 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12465    Character classes ([:foo:]) can also be negated ([:^foo:]).
12466    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12467    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12468    but trigger failures because they are currently unimplemented. */
12469
12470 #define POSIXCC_DONE(c)   ((c) == ':')
12471 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12472 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12473
12474 PERL_STATIC_INLINE I32
12475 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12476 {
12477     dVAR;
12478     I32 namedclass = OOB_NAMEDCLASS;
12479
12480     PERL_ARGS_ASSERT_REGPPOSIXCC;
12481
12482     if (value == '[' && RExC_parse + 1 < RExC_end &&
12483         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12484         POSIXCC(UCHARAT(RExC_parse)))
12485     {
12486         const char c = UCHARAT(RExC_parse);
12487         char* const s = RExC_parse++;
12488
12489         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12490             RExC_parse++;
12491         if (RExC_parse == RExC_end) {
12492             if (strict) {
12493
12494                 /* Try to give a better location for the error (than the end of
12495                  * the string) by looking for the matching ']' */
12496                 RExC_parse = s;
12497                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12498                     RExC_parse++;
12499                 }
12500                 vFAIL2("Unmatched '%c' in POSIX class", c);
12501             }
12502             /* Grandfather lone [:, [=, [. */
12503             RExC_parse = s;
12504         }
12505         else {
12506             const char* const t = RExC_parse++; /* skip over the c */
12507             assert(*t == c);
12508
12509             if (UCHARAT(RExC_parse) == ']') {
12510                 const char *posixcc = s + 1;
12511                 RExC_parse++; /* skip over the ending ] */
12512
12513                 if (*s == ':') {
12514                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12515                     const I32 skip = t - posixcc;
12516
12517                     /* Initially switch on the length of the name.  */
12518                     switch (skip) {
12519                     case 4:
12520                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12521                                                           this is the Perl \w
12522                                                         */
12523                             namedclass = ANYOF_WORDCHAR;
12524                         break;
12525                     case 5:
12526                         /* Names all of length 5.  */
12527                         /* alnum alpha ascii blank cntrl digit graph lower
12528                            print punct space upper  */
12529                         /* Offset 4 gives the best switch position.  */
12530                         switch (posixcc[4]) {
12531                         case 'a':
12532                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12533                                 namedclass = ANYOF_ALPHA;
12534                             break;
12535                         case 'e':
12536                             if (memEQ(posixcc, "spac", 4)) /* space */
12537                                 namedclass = ANYOF_PSXSPC;
12538                             break;
12539                         case 'h':
12540                             if (memEQ(posixcc, "grap", 4)) /* graph */
12541                                 namedclass = ANYOF_GRAPH;
12542                             break;
12543                         case 'i':
12544                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12545                                 namedclass = ANYOF_ASCII;
12546                             break;
12547                         case 'k':
12548                             if (memEQ(posixcc, "blan", 4)) /* blank */
12549                                 namedclass = ANYOF_BLANK;
12550                             break;
12551                         case 'l':
12552                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12553                                 namedclass = ANYOF_CNTRL;
12554                             break;
12555                         case 'm':
12556                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12557                                 namedclass = ANYOF_ALPHANUMERIC;
12558                             break;
12559                         case 'r':
12560                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12561                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12562                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12563                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12564                             break;
12565                         case 't':
12566                             if (memEQ(posixcc, "digi", 4)) /* digit */
12567                                 namedclass = ANYOF_DIGIT;
12568                             else if (memEQ(posixcc, "prin", 4)) /* print */
12569                                 namedclass = ANYOF_PRINT;
12570                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12571                                 namedclass = ANYOF_PUNCT;
12572                             break;
12573                         }
12574                         break;
12575                     case 6:
12576                         if (memEQ(posixcc, "xdigit", 6))
12577                             namedclass = ANYOF_XDIGIT;
12578                         break;
12579                     }
12580
12581                     if (namedclass == OOB_NAMEDCLASS)
12582                         vFAIL2utf8f(
12583                             "POSIX class [:%"UTF8f":] unknown",
12584                             UTF8fARG(UTF, t - s - 1, s + 1));
12585
12586                     /* The #defines are structured so each complement is +1 to
12587                      * the normal one */
12588                     if (complement) {
12589                         namedclass++;
12590                     }
12591                     assert (posixcc[skip] == ':');
12592                     assert (posixcc[skip+1] == ']');
12593                 } else if (!SIZE_ONLY) {
12594                     /* [[=foo=]] and [[.foo.]] are still future. */
12595
12596                     /* adjust RExC_parse so the warning shows after
12597                        the class closes */
12598                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12599                         RExC_parse++;
12600                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12601                 }
12602             } else {
12603                 /* Maternal grandfather:
12604                  * "[:" ending in ":" but not in ":]" */
12605                 if (strict) {
12606                     vFAIL("Unmatched '[' in POSIX class");
12607                 }
12608
12609                 /* Grandfather lone [:, [=, [. */
12610                 RExC_parse = s;
12611             }
12612         }
12613     }
12614
12615     return namedclass;
12616 }
12617
12618 STATIC bool
12619 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12620 {
12621     /* This applies some heuristics at the current parse position (which should
12622      * be at a '[') to see if what follows might be intended to be a [:posix:]
12623      * class.  It returns true if it really is a posix class, of course, but it
12624      * also can return true if it thinks that what was intended was a posix
12625      * class that didn't quite make it.
12626      *
12627      * It will return true for
12628      *      [:alphanumerics:
12629      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12630      *                         ')' indicating the end of the (?[
12631      *      [:any garbage including %^&$ punctuation:]
12632      *
12633      * This is designed to be called only from S_handle_regex_sets; it could be
12634      * easily adapted to be called from the spot at the beginning of regclass()
12635      * that checks to see in a normal bracketed class if the surrounding []
12636      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12637      * change long-standing behavior, so I (khw) didn't do that */
12638     char* p = RExC_parse + 1;
12639     char first_char = *p;
12640
12641     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12642
12643     assert(*(p - 1) == '[');
12644
12645     if (! POSIXCC(first_char)) {
12646         return FALSE;
12647     }
12648
12649     p++;
12650     while (p < RExC_end && isWORDCHAR(*p)) p++;
12651
12652     if (p >= RExC_end) {
12653         return FALSE;
12654     }
12655
12656     if (p - RExC_parse > 2    /* Got at least 1 word character */
12657         && (*p == first_char
12658             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12659     {
12660         return TRUE;
12661     }
12662
12663     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12664
12665     return (p
12666             && p - RExC_parse > 2 /* [:] evaluates to colon;
12667                                       [::] is a bad posix class. */
12668             && first_char == *(p - 1));
12669 }
12670
12671 STATIC regnode *
12672 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12673                     I32 *flagp, U32 depth,
12674                     char * const oregcomp_parse)
12675 {
12676     /* Handle the (?[...]) construct to do set operations */
12677
12678     U8 curchar;
12679     UV start, end;      /* End points of code point ranges */
12680     SV* result_string;
12681     char *save_end, *save_parse;
12682     SV* final;
12683     STRLEN len;
12684     regnode* node;
12685     AV* stack;
12686     const bool save_fold = FOLD;
12687
12688     GET_RE_DEBUG_FLAGS_DECL;
12689
12690     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12691
12692     if (LOC) {
12693         vFAIL("(?[...]) not valid in locale");
12694     }
12695     RExC_uni_semantics = 1;
12696
12697     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12698      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12699      * call regclass to handle '[]' so as to not have to reinvent its parsing
12700      * rules here (throwing away the size it computes each time).  And, we exit
12701      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12702      * these things, we need to realize that something preceded by a backslash
12703      * is escaped, so we have to keep track of backslashes */
12704     if (SIZE_ONLY) {
12705         UV depth = 0; /* how many nested (?[...]) constructs */
12706
12707         Perl_ck_warner_d(aTHX_
12708             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12709             "The regex_sets feature is experimental" REPORT_LOCATION,
12710                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12711                 UTF8fARG(UTF,
12712                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12713                          RExC_precomp + (RExC_parse - RExC_precomp)));
12714
12715         while (RExC_parse < RExC_end) {
12716             SV* current = NULL;
12717             RExC_parse = regpatws(pRExC_state, RExC_parse,
12718                                 TRUE); /* means recognize comments */
12719             switch (*RExC_parse) {
12720                 case '?':
12721                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12722                     /* FALL THROUGH */
12723                 default:
12724                     break;
12725                 case '\\':
12726                     /* Skip the next byte (which could cause us to end up in
12727                      * the middle of a UTF-8 character, but since none of those
12728                      * are confusable with anything we currently handle in this
12729                      * switch (invariants all), it's safe.  We'll just hit the
12730                      * default: case next time and keep on incrementing until
12731                      * we find one of the invariants we do handle. */
12732                     RExC_parse++;
12733                     break;
12734                 case '[':
12735                 {
12736                     /* If this looks like it is a [:posix:] class, leave the
12737                      * parse pointer at the '[' to fool regclass() into
12738                      * thinking it is part of a '[[:posix:]]'.  That function
12739                      * will use strict checking to force a syntax error if it
12740                      * doesn't work out to a legitimate class */
12741                     bool is_posix_class
12742                                     = could_it_be_a_POSIX_class(pRExC_state);
12743                     if (! is_posix_class) {
12744                         RExC_parse++;
12745                     }
12746
12747                     /* regclass() can only return RESTART_UTF8 if multi-char
12748                        folds are allowed.  */
12749                     if (!regclass(pRExC_state, flagp,depth+1,
12750                                   is_posix_class, /* parse the whole char
12751                                                      class only if not a
12752                                                      posix class */
12753                                   FALSE, /* don't allow multi-char folds */
12754                                   TRUE, /* silence non-portable warnings. */
12755                                   &current))
12756                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12757                               (UV) *flagp);
12758
12759                     /* function call leaves parse pointing to the ']', except
12760                      * if we faked it */
12761                     if (is_posix_class) {
12762                         RExC_parse--;
12763                     }
12764
12765                     SvREFCNT_dec(current);   /* In case it returned something */
12766                     break;
12767                 }
12768
12769                 case ']':
12770                     if (depth--) break;
12771                     RExC_parse++;
12772                     if (RExC_parse < RExC_end
12773                         && *RExC_parse == ')')
12774                     {
12775                         node = reganode(pRExC_state, ANYOF, 0);
12776                         RExC_size += ANYOF_SKIP;
12777                         nextchar(pRExC_state);
12778                         Set_Node_Length(node,
12779                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12780                         return node;
12781                     }
12782                     goto no_close;
12783             }
12784             RExC_parse++;
12785         }
12786
12787         no_close:
12788         FAIL("Syntax error in (?[...])");
12789     }
12790
12791     /* Pass 2 only after this.  Everything in this construct is a
12792      * metacharacter.  Operands begin with either a '\' (for an escape
12793      * sequence), or a '[' for a bracketed character class.  Any other
12794      * character should be an operator, or parenthesis for grouping.  Both
12795      * types of operands are handled by calling regclass() to parse them.  It
12796      * is called with a parameter to indicate to return the computed inversion
12797      * list.  The parsing here is implemented via a stack.  Each entry on the
12798      * stack is a single character representing one of the operators, or the
12799      * '('; or else a pointer to an operand inversion list. */
12800
12801 #define IS_OPERAND(a)  (! SvIOK(a))
12802
12803     /* The stack starts empty.  It is a syntax error if the first thing parsed
12804      * is a binary operator; everything else is pushed on the stack.  When an
12805      * operand is parsed, the top of the stack is examined.  If it is a binary
12806      * operator, the item before it should be an operand, and both are replaced
12807      * by the result of doing that operation on the new operand and the one on
12808      * the stack.   Thus a sequence of binary operands is reduced to a single
12809      * one before the next one is parsed.
12810      *
12811      * A unary operator may immediately follow a binary in the input, for
12812      * example
12813      *      [a] + ! [b]
12814      * When an operand is parsed and the top of the stack is a unary operator,
12815      * the operation is performed, and then the stack is rechecked to see if
12816      * this new operand is part of a binary operation; if so, it is handled as
12817      * above.
12818      *
12819      * A '(' is simply pushed on the stack; it is valid only if the stack is
12820      * empty, or the top element of the stack is an operator or another '('
12821      * (for which the parenthesized expression will become an operand).  By the
12822      * time the corresponding ')' is parsed everything in between should have
12823      * been parsed and evaluated to a single operand (or else is a syntax
12824      * error), and is handled as a regular operand */
12825
12826     sv_2mortal((SV *)(stack = newAV()));
12827
12828     while (RExC_parse < RExC_end) {
12829         I32 top_index = av_tindex(stack);
12830         SV** top_ptr;
12831         SV* current = NULL;
12832
12833         /* Skip white space */
12834         RExC_parse = regpatws(pRExC_state, RExC_parse,
12835                                 TRUE); /* means recognize comments */
12836         if (RExC_parse >= RExC_end) {
12837             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12838         }
12839         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12840             break;
12841         }
12842
12843         switch (curchar) {
12844
12845             case '?':
12846                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12847                                                safely subtract 1 from
12848                                                RExC_parse in the next clause.
12849                                                If we have something on the
12850                                                stack, we have parsed something
12851                                              */
12852                     && UCHARAT(RExC_parse - 1) == '('
12853                     && RExC_parse < RExC_end)
12854                 {
12855                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12856                      * This happens when we have some thing like
12857                      *
12858                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12859                      *   ...
12860                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12861                      *
12862                      * Here we would be handling the interpolated
12863                      * '$thai_or_lao'.  We handle this by a recursive call to
12864                      * ourselves which returns the inversion list the
12865                      * interpolated expression evaluates to.  We use the flags
12866                      * from the interpolated pattern. */
12867                     U32 save_flags = RExC_flags;
12868                     const char * const save_parse = ++RExC_parse;
12869
12870                     parse_lparen_question_flags(pRExC_state);
12871
12872                     if (RExC_parse == save_parse  /* Makes sure there was at
12873                                                      least one flag (or this
12874                                                      embedding wasn't compiled)
12875                                                    */
12876                         || RExC_parse >= RExC_end - 4
12877                         || UCHARAT(RExC_parse) != ':'
12878                         || UCHARAT(++RExC_parse) != '('
12879                         || UCHARAT(++RExC_parse) != '?'
12880                         || UCHARAT(++RExC_parse) != '[')
12881                     {
12882
12883                         /* In combination with the above, this moves the
12884                          * pointer to the point just after the first erroneous
12885                          * character (or if there are no flags, to where they
12886                          * should have been) */
12887                         if (RExC_parse >= RExC_end - 4) {
12888                             RExC_parse = RExC_end;
12889                         }
12890                         else if (RExC_parse != save_parse) {
12891                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12892                         }
12893                         vFAIL("Expecting '(?flags:(?[...'");
12894                     }
12895                     RExC_parse++;
12896                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12897                                                     depth+1, oregcomp_parse);
12898
12899                     /* Here, 'current' contains the embedded expression's
12900                      * inversion list, and RExC_parse points to the trailing
12901                      * ']'; the next character should be the ')' which will be
12902                      * paired with the '(' that has been put on the stack, so
12903                      * the whole embedded expression reduces to '(operand)' */
12904                     RExC_parse++;
12905
12906                     RExC_flags = save_flags;
12907                     goto handle_operand;
12908                 }
12909                 /* FALL THROUGH */
12910
12911             default:
12912                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12913                 vFAIL("Unexpected character");
12914
12915             case '\\':
12916                 /* regclass() can only return RESTART_UTF8 if multi-char
12917                    folds are allowed.  */
12918                 if (!regclass(pRExC_state, flagp,depth+1,
12919                               TRUE, /* means parse just the next thing */
12920                               FALSE, /* don't allow multi-char folds */
12921                               FALSE, /* don't silence non-portable warnings.  */
12922                               &current))
12923                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12924                           (UV) *flagp);
12925                 /* regclass() will return with parsing just the \ sequence,
12926                  * leaving the parse pointer at the next thing to parse */
12927                 RExC_parse--;
12928                 goto handle_operand;
12929
12930             case '[':   /* Is a bracketed character class */
12931             {
12932                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12933
12934                 if (! is_posix_class) {
12935                     RExC_parse++;
12936                 }
12937
12938                 /* regclass() can only return RESTART_UTF8 if multi-char
12939                    folds are allowed.  */
12940                 if(!regclass(pRExC_state, flagp,depth+1,
12941                              is_posix_class, /* parse the whole char class
12942                                                 only if not a posix class */
12943                              FALSE, /* don't allow multi-char folds */
12944                              FALSE, /* don't silence non-portable warnings.  */
12945                              &current))
12946                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12947                           (UV) *flagp);
12948                 /* function call leaves parse pointing to the ']', except if we
12949                  * faked it */
12950                 if (is_posix_class) {
12951                     RExC_parse--;
12952                 }
12953
12954                 goto handle_operand;
12955             }
12956
12957             case '&':
12958             case '|':
12959             case '+':
12960             case '-':
12961             case '^':
12962                 if (top_index < 0
12963                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12964                     || ! IS_OPERAND(*top_ptr))
12965                 {
12966                     RExC_parse++;
12967                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12968                 }
12969                 av_push(stack, newSVuv(curchar));
12970                 break;
12971
12972             case '!':
12973                 av_push(stack, newSVuv(curchar));
12974                 break;
12975
12976             case '(':
12977                 if (top_index >= 0) {
12978                     top_ptr = av_fetch(stack, top_index, FALSE);
12979                     assert(top_ptr);
12980                     if (IS_OPERAND(*top_ptr)) {
12981                         RExC_parse++;
12982                         vFAIL("Unexpected '(' with no preceding operator");
12983                     }
12984                 }
12985                 av_push(stack, newSVuv(curchar));
12986                 break;
12987
12988             case ')':
12989             {
12990                 SV* lparen;
12991                 if (top_index < 1
12992                     || ! (current = av_pop(stack))
12993                     || ! IS_OPERAND(current)
12994                     || ! (lparen = av_pop(stack))
12995                     || IS_OPERAND(lparen)
12996                     || SvUV(lparen) != '(')
12997                 {
12998                     SvREFCNT_dec(current);
12999                     RExC_parse++;
13000                     vFAIL("Unexpected ')'");
13001                 }
13002                 top_index -= 2;
13003                 SvREFCNT_dec_NN(lparen);
13004
13005                 /* FALL THROUGH */
13006             }
13007
13008               handle_operand:
13009
13010                 /* Here, we have an operand to process, in 'current' */
13011
13012                 if (top_index < 0) {    /* Just push if stack is empty */
13013                     av_push(stack, current);
13014                 }
13015                 else {
13016                     SV* top = av_pop(stack);
13017                     SV *prev = NULL;
13018                     char current_operator;
13019
13020                     if (IS_OPERAND(top)) {
13021                         SvREFCNT_dec_NN(top);
13022                         SvREFCNT_dec_NN(current);
13023                         vFAIL("Operand with no preceding operator");
13024                     }
13025                     current_operator = (char) SvUV(top);
13026                     switch (current_operator) {
13027                         case '(':   /* Push the '(' back on followed by the new
13028                                        operand */
13029                             av_push(stack, top);
13030                             av_push(stack, current);
13031                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13032                                                    just after the 'break', so
13033                                                    it doesn't get wrongly freed
13034                                                  */
13035                             break;
13036
13037                         case '!':
13038                             _invlist_invert(current);
13039
13040                             /* Unlike binary operators, the top of the stack,
13041                              * now that this unary one has been popped off, may
13042                              * legally be an operator, and we now have operand
13043                              * for it. */
13044                             top_index--;
13045                             SvREFCNT_dec_NN(top);
13046                             goto handle_operand;
13047
13048                         case '&':
13049                             prev = av_pop(stack);
13050                             _invlist_intersection(prev,
13051                                                    current,
13052                                                    &current);
13053                             av_push(stack, current);
13054                             break;
13055
13056                         case '|':
13057                         case '+':
13058                             prev = av_pop(stack);
13059                             _invlist_union(prev, current, &current);
13060                             av_push(stack, current);
13061                             break;
13062
13063                         case '-':
13064                             prev = av_pop(stack);;
13065                             _invlist_subtract(prev, current, &current);
13066                             av_push(stack, current);
13067                             break;
13068
13069                         case '^':   /* The union minus the intersection */
13070                         {
13071                             SV* i = NULL;
13072                             SV* u = NULL;
13073                             SV* element;
13074
13075                             prev = av_pop(stack);
13076                             _invlist_union(prev, current, &u);
13077                             _invlist_intersection(prev, current, &i);
13078                             /* _invlist_subtract will overwrite current
13079                                 without freeing what it already contains */
13080                             element = current;
13081                             _invlist_subtract(u, i, &current);
13082                             av_push(stack, current);
13083                             SvREFCNT_dec_NN(i);
13084                             SvREFCNT_dec_NN(u);
13085                             SvREFCNT_dec_NN(element);
13086                             break;
13087                         }
13088
13089                         default:
13090                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13091                 }
13092                 SvREFCNT_dec_NN(top);
13093                 SvREFCNT_dec(prev);
13094             }
13095         }
13096
13097         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13098     }
13099
13100     if (av_tindex(stack) < 0   /* Was empty */
13101         || ((final = av_pop(stack)) == NULL)
13102         || ! IS_OPERAND(final)
13103         || av_tindex(stack) >= 0)  /* More left on stack */
13104     {
13105         vFAIL("Incomplete expression within '(?[ ])'");
13106     }
13107
13108     /* Here, 'final' is the resultant inversion list from evaluating the
13109      * expression.  Return it if so requested */
13110     if (return_invlist) {
13111         *return_invlist = final;
13112         return END;
13113     }
13114
13115     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13116      * expecting a string of ranges and individual code points */
13117     invlist_iterinit(final);
13118     result_string = newSVpvs("");
13119     while (invlist_iternext(final, &start, &end)) {
13120         if (start == end) {
13121             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13122         }
13123         else {
13124             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13125                                                      start,          end);
13126         }
13127     }
13128
13129     save_parse = RExC_parse;
13130     RExC_parse = SvPV(result_string, len);
13131     save_end = RExC_end;
13132     RExC_end = RExC_parse + len;
13133
13134     /* We turn off folding around the call, as the class we have constructed
13135      * already has all folding taken into consideration, and we don't want
13136      * regclass() to add to that */
13137     RExC_flags &= ~RXf_PMf_FOLD;
13138     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13139      */
13140     node = regclass(pRExC_state, flagp,depth+1,
13141                     FALSE, /* means parse the whole char class */
13142                     FALSE, /* don't allow multi-char folds */
13143                     TRUE, /* silence non-portable warnings.  The above may very
13144                              well have generated non-portable code points, but
13145                              they're valid on this machine */
13146                     NULL);
13147     if (!node)
13148         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13149                     PTR2UV(flagp));
13150     if (save_fold) {
13151         RExC_flags |= RXf_PMf_FOLD;
13152     }
13153     RExC_parse = save_parse + 1;
13154     RExC_end = save_end;
13155     SvREFCNT_dec_NN(final);
13156     SvREFCNT_dec_NN(result_string);
13157
13158     nextchar(pRExC_state);
13159     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13160     return node;
13161 }
13162 #undef IS_OPERAND
13163
13164 /* The names of properties whose definitions are not known at compile time are
13165  * stored in this SV, after a constant heading.  So if the length has been
13166  * changed since initialization, then there is a run-time definition. */
13167 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13168                                         (SvCUR(listsv) != initial_listsv_len)
13169
13170 STATIC regnode *
13171 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13172                  const bool stop_at_1,  /* Just parse the next thing, don't
13173                                            look for a full character class */
13174                  bool allow_multi_folds,
13175                  const bool silence_non_portable,   /* Don't output warnings
13176                                                        about too large
13177                                                        characters */
13178                  SV** ret_invlist)  /* Return an inversion list, not a node */
13179 {
13180     /* parse a bracketed class specification.  Most of these will produce an
13181      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13182      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13183      * under /i with multi-character folds: it will be rewritten following the
13184      * paradigm of this example, where the <multi-fold>s are characters which
13185      * fold to multiple character sequences:
13186      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13187      * gets effectively rewritten as:
13188      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13189      * reg() gets called (recursively) on the rewritten version, and this
13190      * function will return what it constructs.  (Actually the <multi-fold>s
13191      * aren't physically removed from the [abcdefghi], it's just that they are
13192      * ignored in the recursion by means of a flag:
13193      * <RExC_in_multi_char_class>.)
13194      *
13195      * ANYOF nodes contain a bit map for the first 256 characters, with the
13196      * corresponding bit set if that character is in the list.  For characters
13197      * above 255, a range list or swash is used.  There are extra bits for \w,
13198      * etc. in locale ANYOFs, as what these match is not determinable at
13199      * compile time
13200      *
13201      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13202      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13203      */
13204
13205     dVAR;
13206     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13207     IV range = 0;
13208     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13209     regnode *ret;
13210     STRLEN numlen;
13211     IV namedclass = OOB_NAMEDCLASS;
13212     char *rangebegin = NULL;
13213     bool need_class = 0;
13214     SV *listsv = NULL;
13215     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13216                                       than just initialized.  */
13217     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13218     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13219                                extended beyond the Latin1 range.  These have to
13220                                be kept separate from other code points for much
13221                                of this function because their handling  is
13222                                different under /i, and for most classes under
13223                                /d as well */
13224     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13225                                separate for a while from the non-complemented
13226                                versions because of complications with /d
13227                                matching */
13228     UV element_count = 0;   /* Number of distinct elements in the class.
13229                                Optimizations may be possible if this is tiny */
13230     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13231                                        character; used under /i */
13232     UV n;
13233     char * stop_ptr = RExC_end;    /* where to stop parsing */
13234     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13235                                                    space? */
13236     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13237
13238     /* Unicode properties are stored in a swash; this holds the current one
13239      * being parsed.  If this swash is the only above-latin1 component of the
13240      * character class, an optimization is to pass it directly on to the
13241      * execution engine.  Otherwise, it is set to NULL to indicate that there
13242      * are other things in the class that have to be dealt with at execution
13243      * time */
13244     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13245
13246     /* Set if a component of this character class is user-defined; just passed
13247      * on to the engine */
13248     bool has_user_defined_property = FALSE;
13249
13250     /* inversion list of code points this node matches only when the target
13251      * string is in UTF-8.  (Because is under /d) */
13252     SV* depends_list = NULL;
13253
13254     /* Inversion list of code points this node matches regardless of things
13255      * like locale, folding, utf8ness of the target string */
13256     SV* cp_list = NULL;
13257
13258     /* Like cp_list, but code points on this list need to be checked for things
13259      * that fold to/from them under /i */
13260     SV* cp_foldable_list = NULL;
13261
13262     /* Like cp_list, but code points on this list are valid only when the
13263      * runtime locale is UTF-8 */
13264     SV* only_utf8_locale_list = NULL;
13265
13266 #ifdef EBCDIC
13267     /* In a range, counts how many 0-2 of the ends of it came from literals,
13268      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13269     UV literal_endpoint = 0;
13270 #endif
13271     bool invert = FALSE;    /* Is this class to be complemented */
13272
13273     bool warn_super = ALWAYS_WARN_SUPER;
13274
13275     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13276         case we need to change the emitted regop to an EXACT. */
13277     const char * orig_parse = RExC_parse;
13278     const SSize_t orig_size = RExC_size;
13279     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13280     GET_RE_DEBUG_FLAGS_DECL;
13281
13282     PERL_ARGS_ASSERT_REGCLASS;
13283 #ifndef DEBUGGING
13284     PERL_UNUSED_ARG(depth);
13285 #endif
13286
13287     DEBUG_PARSE("clas");
13288
13289     /* Assume we are going to generate an ANYOF node. */
13290     ret = reganode(pRExC_state, ANYOF, 0);
13291
13292     if (SIZE_ONLY) {
13293         RExC_size += ANYOF_SKIP;
13294         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13295     }
13296     else {
13297         ANYOF_FLAGS(ret) = 0;
13298
13299         RExC_emit += ANYOF_SKIP;
13300         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13301         initial_listsv_len = SvCUR(listsv);
13302         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13303     }
13304
13305     if (skip_white) {
13306         RExC_parse = regpatws(pRExC_state, RExC_parse,
13307                               FALSE /* means don't recognize comments */);
13308     }
13309
13310     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13311         RExC_parse++;
13312         invert = TRUE;
13313         allow_multi_folds = FALSE;
13314         RExC_naughty++;
13315         if (skip_white) {
13316             RExC_parse = regpatws(pRExC_state, RExC_parse,
13317                                   FALSE /* means don't recognize comments */);
13318         }
13319     }
13320
13321     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13322     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13323         const char *s = RExC_parse;
13324         const char  c = *s++;
13325
13326         while (isWORDCHAR(*s))
13327             s++;
13328         if (*s && c == *s && s[1] == ']') {
13329             SAVEFREESV(RExC_rx_sv);
13330             ckWARN3reg(s+2,
13331                        "POSIX syntax [%c %c] belongs inside character classes",
13332                        c, c);
13333             (void)ReREFCNT_inc(RExC_rx_sv);
13334         }
13335     }
13336
13337     /* If the caller wants us to just parse a single element, accomplish this
13338      * by faking the loop ending condition */
13339     if (stop_at_1 && RExC_end > RExC_parse) {
13340         stop_ptr = RExC_parse + 1;
13341     }
13342
13343     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13344     if (UCHARAT(RExC_parse) == ']')
13345         goto charclassloop;
13346
13347 parseit:
13348     while (1) {
13349         if  (RExC_parse >= stop_ptr) {
13350             break;
13351         }
13352
13353         if (skip_white) {
13354             RExC_parse = regpatws(pRExC_state, RExC_parse,
13355                                   FALSE /* means don't recognize comments */);
13356         }
13357
13358         if  (UCHARAT(RExC_parse) == ']') {
13359             break;
13360         }
13361
13362     charclassloop:
13363
13364         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13365         save_value = value;
13366         save_prevvalue = prevvalue;
13367
13368         if (!range) {
13369             rangebegin = RExC_parse;
13370             element_count++;
13371         }
13372         if (UTF) {
13373             value = utf8n_to_uvchr((U8*)RExC_parse,
13374                                    RExC_end - RExC_parse,
13375                                    &numlen, UTF8_ALLOW_DEFAULT);
13376             RExC_parse += numlen;
13377         }
13378         else
13379             value = UCHARAT(RExC_parse++);
13380
13381         if (value == '['
13382             && RExC_parse < RExC_end
13383             && POSIXCC(UCHARAT(RExC_parse)))
13384         {
13385             namedclass = regpposixcc(pRExC_state, value, strict);
13386         }
13387         else if (value == '\\') {
13388             if (UTF) {
13389                 value = utf8n_to_uvchr((U8*)RExC_parse,
13390                                    RExC_end - RExC_parse,
13391                                    &numlen, UTF8_ALLOW_DEFAULT);
13392                 RExC_parse += numlen;
13393             }
13394             else
13395                 value = UCHARAT(RExC_parse++);
13396
13397             /* Some compilers cannot handle switching on 64-bit integer
13398              * values, therefore value cannot be an UV.  Yes, this will
13399              * be a problem later if we want switch on Unicode.
13400              * A similar issue a little bit later when switching on
13401              * namedclass. --jhi */
13402
13403             /* If the \ is escaping white space when white space is being
13404              * skipped, it means that that white space is wanted literally, and
13405              * is already in 'value'.  Otherwise, need to translate the escape
13406              * into what it signifies. */
13407             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13408
13409             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13410             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13411             case 's':   namedclass = ANYOF_SPACE;       break;
13412             case 'S':   namedclass = ANYOF_NSPACE;      break;
13413             case 'd':   namedclass = ANYOF_DIGIT;       break;
13414             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13415             case 'v':   namedclass = ANYOF_VERTWS;      break;
13416             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13417             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13418             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13419             case 'N':  /* Handle \N{NAME} in class */
13420                 {
13421                     /* We only pay attention to the first char of
13422                     multichar strings being returned. I kinda wonder
13423                     if this makes sense as it does change the behaviour
13424                     from earlier versions, OTOH that behaviour was broken
13425                     as well. */
13426                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13427                                       TRUE, /* => charclass */
13428                                       strict))
13429                     {
13430                         if (*flagp & RESTART_UTF8)
13431                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13432                         goto parseit;
13433                     }
13434                 }
13435                 break;
13436             case 'p':
13437             case 'P':
13438                 {
13439                 char *e;
13440
13441                 /* We will handle any undefined properties ourselves */
13442                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13443                                        /* And we actually would prefer to get
13444                                         * the straight inversion list of the
13445                                         * swash, since we will be accessing it
13446                                         * anyway, to save a little time */
13447                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13448
13449                 if (RExC_parse >= RExC_end)
13450                     vFAIL2("Empty \\%c{}", (U8)value);
13451                 if (*RExC_parse == '{') {
13452                     const U8 c = (U8)value;
13453                     e = strchr(RExC_parse++, '}');
13454                     if (!e)
13455                         vFAIL2("Missing right brace on \\%c{}", c);
13456                     while (isSPACE(UCHARAT(RExC_parse)))
13457                         RExC_parse++;
13458                     if (e == RExC_parse)
13459                         vFAIL2("Empty \\%c{}", c);
13460                     n = e - RExC_parse;
13461                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13462                         n--;
13463                 }
13464                 else {
13465                     e = RExC_parse;
13466                     n = 1;
13467                 }
13468                 if (!SIZE_ONLY) {
13469                     SV* invlist;
13470                     char* formatted;
13471                     char* name;
13472
13473                     if (UCHARAT(RExC_parse) == '^') {
13474                          RExC_parse++;
13475                          n--;
13476                          /* toggle.  (The rhs xor gets the single bit that
13477                           * differs between P and p; the other xor inverts just
13478                           * that bit) */
13479                          value ^= 'P' ^ 'p';
13480
13481                          while (isSPACE(UCHARAT(RExC_parse))) {
13482                               RExC_parse++;
13483                               n--;
13484                          }
13485                     }
13486                     /* Try to get the definition of the property into
13487                      * <invlist>.  If /i is in effect, the effective property
13488                      * will have its name be <__NAME_i>.  The design is
13489                      * discussed in commit
13490                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13491                     formatted = Perl_form(aTHX_
13492                                           "%s%.*s%s\n",
13493                                           (FOLD) ? "__" : "",
13494                                           (int)n,
13495                                           RExC_parse,
13496                                           (FOLD) ? "_i" : ""
13497                                 );
13498                     name = savepvn(formatted, strlen(formatted));
13499
13500                     /* Look up the property name, and get its swash and
13501                      * inversion list, if the property is found  */
13502                     if (swash) {
13503                         SvREFCNT_dec_NN(swash);
13504                     }
13505                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13506                                              1, /* binary */
13507                                              0, /* not tr/// */
13508                                              NULL, /* No inversion list */
13509                                              &swash_init_flags
13510                                             );
13511                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13512                         if (swash) {
13513                             SvREFCNT_dec_NN(swash);
13514                             swash = NULL;
13515                         }
13516
13517                         /* Here didn't find it.  It could be a user-defined
13518                          * property that will be available at run-time.  If we
13519                          * accept only compile-time properties, is an error;
13520                          * otherwise add it to the list for run-time look up */
13521                         if (ret_invlist) {
13522                             RExC_parse = e + 1;
13523                             vFAIL2utf8f(
13524                                 "Property '%"UTF8f"' is unknown",
13525                                 UTF8fARG(UTF, n, name));
13526                         }
13527                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13528                                         (value == 'p' ? '+' : '!'),
13529                                         UTF8fARG(UTF, n, name));
13530                         has_user_defined_property = TRUE;
13531
13532                         /* We don't know yet, so have to assume that the
13533                          * property could match something in the Latin1 range,
13534                          * hence something that isn't utf8.  Note that this
13535                          * would cause things in <depends_list> to match
13536                          * inappropriately, except that any \p{}, including
13537                          * this one forces Unicode semantics, which means there
13538                          * is no <depends_list> */
13539                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13540                     }
13541                     else {
13542
13543                         /* Here, did get the swash and its inversion list.  If
13544                          * the swash is from a user-defined property, then this
13545                          * whole character class should be regarded as such */
13546                         if (swash_init_flags
13547                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13548                         {
13549                             has_user_defined_property = TRUE;
13550                         }
13551                         else if
13552                             /* We warn on matching an above-Unicode code point
13553                              * if the match would return true, except don't
13554                              * warn for \p{All}, which has exactly one element
13555                              * = 0 */
13556                             (_invlist_contains_cp(invlist, 0x110000)
13557                                 && (! (_invlist_len(invlist) == 1
13558                                        && *invlist_array(invlist) == 0)))
13559                         {
13560                             warn_super = TRUE;
13561                         }
13562
13563
13564                         /* Invert if asking for the complement */
13565                         if (value == 'P') {
13566                             _invlist_union_complement_2nd(properties,
13567                                                           invlist,
13568                                                           &properties);
13569
13570                             /* The swash can't be used as-is, because we've
13571                              * inverted things; delay removing it to here after
13572                              * have copied its invlist above */
13573                             SvREFCNT_dec_NN(swash);
13574                             swash = NULL;
13575                         }
13576                         else {
13577                             _invlist_union(properties, invlist, &properties);
13578                         }
13579                     }
13580                     Safefree(name);
13581                 }
13582                 RExC_parse = e + 1;
13583                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13584                                                 named */
13585
13586                 /* \p means they want Unicode semantics */
13587                 RExC_uni_semantics = 1;
13588                 }
13589                 break;
13590             case 'n':   value = '\n';                   break;
13591             case 'r':   value = '\r';                   break;
13592             case 't':   value = '\t';                   break;
13593             case 'f':   value = '\f';                   break;
13594             case 'b':   value = '\b';                   break;
13595             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13596             case 'a':   value = '\a';                   break;
13597             case 'o':
13598                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13599                 {
13600                     const char* error_msg;
13601                     bool valid = grok_bslash_o(&RExC_parse,
13602                                                &value,
13603                                                &error_msg,
13604                                                SIZE_ONLY,   /* warnings in pass
13605                                                                1 only */
13606                                                strict,
13607                                                silence_non_portable,
13608                                                UTF);
13609                     if (! valid) {
13610                         vFAIL(error_msg);
13611                     }
13612                 }
13613                 if (PL_encoding && value < 0x100) {
13614                     goto recode_encoding;
13615                 }
13616                 break;
13617             case 'x':
13618                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13619                 {
13620                     const char* error_msg;
13621                     bool valid = grok_bslash_x(&RExC_parse,
13622                                                &value,
13623                                                &error_msg,
13624                                                TRUE, /* Output warnings */
13625                                                strict,
13626                                                silence_non_portable,
13627                                                UTF);
13628                     if (! valid) {
13629                         vFAIL(error_msg);
13630                     }
13631                 }
13632                 if (PL_encoding && value < 0x100)
13633                     goto recode_encoding;
13634                 break;
13635             case 'c':
13636                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13637                 break;
13638             case '0': case '1': case '2': case '3': case '4':
13639             case '5': case '6': case '7':
13640                 {
13641                     /* Take 1-3 octal digits */
13642                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13643                     numlen = (strict) ? 4 : 3;
13644                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13645                     RExC_parse += numlen;
13646                     if (numlen != 3) {
13647                         if (strict) {
13648                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13649                             vFAIL("Need exactly 3 octal digits");
13650                         }
13651                         else if (! SIZE_ONLY /* like \08, \178 */
13652                                  && numlen < 3
13653                                  && RExC_parse < RExC_end
13654                                  && isDIGIT(*RExC_parse)
13655                                  && ckWARN(WARN_REGEXP))
13656                         {
13657                             SAVEFREESV(RExC_rx_sv);
13658                             reg_warn_non_literal_string(
13659                                  RExC_parse + 1,
13660                                  form_short_octal_warning(RExC_parse, numlen));
13661                             (void)ReREFCNT_inc(RExC_rx_sv);
13662                         }
13663                     }
13664                     if (PL_encoding && value < 0x100)
13665                         goto recode_encoding;
13666                     break;
13667                 }
13668             recode_encoding:
13669                 if (! RExC_override_recoding) {
13670                     SV* enc = PL_encoding;
13671                     value = reg_recode((const char)(U8)value, &enc);
13672                     if (!enc) {
13673                         if (strict) {
13674                             vFAIL("Invalid escape in the specified encoding");
13675                         }
13676                         else if (SIZE_ONLY) {
13677                             ckWARNreg(RExC_parse,
13678                                   "Invalid escape in the specified encoding");
13679                         }
13680                     }
13681                     break;
13682                 }
13683             default:
13684                 /* Allow \_ to not give an error */
13685                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13686                     if (strict) {
13687                         vFAIL2("Unrecognized escape \\%c in character class",
13688                                (int)value);
13689                     }
13690                     else {
13691                         SAVEFREESV(RExC_rx_sv);
13692                         ckWARN2reg(RExC_parse,
13693                             "Unrecognized escape \\%c in character class passed through",
13694                             (int)value);
13695                         (void)ReREFCNT_inc(RExC_rx_sv);
13696                     }
13697                 }
13698                 break;
13699             }   /* End of switch on char following backslash */
13700         } /* end of handling backslash escape sequences */
13701 #ifdef EBCDIC
13702         else
13703             literal_endpoint++;
13704 #endif
13705
13706         /* Here, we have the current token in 'value' */
13707
13708         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13709             U8 classnum;
13710
13711             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13712              * literal, as is the character that began the false range, i.e.
13713              * the 'a' in the examples */
13714             if (range) {
13715                 if (!SIZE_ONLY) {
13716                     const int w = (RExC_parse >= rangebegin)
13717                                   ? RExC_parse - rangebegin
13718                                   : 0;
13719                     if (strict) {
13720                         vFAIL2utf8f(
13721                             "False [] range \"%"UTF8f"\"",
13722                             UTF8fARG(UTF, w, rangebegin));
13723                     }
13724                     else {
13725                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13726                         ckWARN2reg(RExC_parse,
13727                             "False [] range \"%"UTF8f"\"",
13728                             UTF8fARG(UTF, w, rangebegin));
13729                         (void)ReREFCNT_inc(RExC_rx_sv);
13730                         cp_list = add_cp_to_invlist(cp_list, '-');
13731                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13732                                                              prevvalue);
13733                     }
13734                 }
13735
13736                 range = 0; /* this was not a true range */
13737                 element_count += 2; /* So counts for three values */
13738             }
13739
13740             classnum = namedclass_to_classnum(namedclass);
13741
13742             if (LOC && namedclass < ANYOF_POSIXL_MAX
13743 #ifndef HAS_ISASCII
13744                 && classnum != _CC_ASCII
13745 #endif
13746             ) {
13747                 /* What the Posix classes (like \w, [:space:]) match in locale
13748                  * isn't knowable under locale until actual match time.  Room
13749                  * must be reserved (one time per outer bracketed class) to
13750                  * store such classes.  The space will contain a bit for each
13751                  * named class that is to be matched against.  This isn't
13752                  * needed for \p{} and pseudo-classes, as they are not affected
13753                  * by locale, and hence are dealt with separately */
13754                 if (! need_class) {
13755                     need_class = 1;
13756                     if (SIZE_ONLY) {
13757                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13758                     }
13759                     else {
13760                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13761                     }
13762                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13763                     ANYOF_POSIXL_ZERO(ret);
13764                 }
13765
13766                 /* See if it already matches the complement of this POSIX
13767                  * class */
13768                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13769                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13770                                                             ? -1
13771                                                             : 1)))
13772                 {
13773                     posixl_matches_all = TRUE;
13774                     break;  /* No need to continue.  Since it matches both
13775                                e.g., \w and \W, it matches everything, and the
13776                                bracketed class can be optimized into qr/./s */
13777                 }
13778
13779                 /* Add this class to those that should be checked at runtime */
13780                 ANYOF_POSIXL_SET(ret, namedclass);
13781
13782                 /* The above-Latin1 characters are not subject to locale rules.
13783                  * Just add them, in the second pass, to the
13784                  * unconditionally-matched list */
13785                 if (! SIZE_ONLY) {
13786                     SV* scratch_list = NULL;
13787
13788                     /* Get the list of the above-Latin1 code points this
13789                      * matches */
13790                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13791                                           PL_XPosix_ptrs[classnum],
13792
13793                                           /* Odd numbers are complements, like
13794                                            * NDIGIT, NASCII, ... */
13795                                           namedclass % 2 != 0,
13796                                           &scratch_list);
13797                     /* Checking if 'cp_list' is NULL first saves an extra
13798                      * clone.  Its reference count will be decremented at the
13799                      * next union, etc, or if this is the only instance, at the
13800                      * end of the routine */
13801                     if (! cp_list) {
13802                         cp_list = scratch_list;
13803                     }
13804                     else {
13805                         _invlist_union(cp_list, scratch_list, &cp_list);
13806                         SvREFCNT_dec_NN(scratch_list);
13807                     }
13808                     continue;   /* Go get next character */
13809                 }
13810             }
13811             else if (! SIZE_ONLY) {
13812
13813                 /* Here, not in pass1 (in that pass we skip calculating the
13814                  * contents of this class), and is /l, or is a POSIX class for
13815                  * which /l doesn't matter (or is a Unicode property, which is
13816                  * skipped here). */
13817                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13818                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13819
13820                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13821                          * nor /l make a difference in what these match,
13822                          * therefore we just add what they match to cp_list. */
13823                         if (classnum != _CC_VERTSPACE) {
13824                             assert(   namedclass == ANYOF_HORIZWS
13825                                    || namedclass == ANYOF_NHORIZWS);
13826
13827                             /* It turns out that \h is just a synonym for
13828                              * XPosixBlank */
13829                             classnum = _CC_BLANK;
13830                         }
13831
13832                         _invlist_union_maybe_complement_2nd(
13833                                 cp_list,
13834                                 PL_XPosix_ptrs[classnum],
13835                                 namedclass % 2 != 0,    /* Complement if odd
13836                                                           (NHORIZWS, NVERTWS)
13837                                                         */
13838                                 &cp_list);
13839                     }
13840                 }
13841                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13842                            complement and use nposixes */
13843                     SV** posixes_ptr = namedclass % 2 == 0
13844                                        ? &posixes
13845                                        : &nposixes;
13846                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13847                     _invlist_union_maybe_complement_2nd(
13848                                                      *posixes_ptr,
13849                                                      *source_ptr,
13850                                                      namedclass % 2 != 0,
13851                                                      posixes_ptr);
13852                 }
13853                 continue;   /* Go get next character */
13854             }
13855         } /* end of namedclass \blah */
13856
13857         /* Here, we have a single value.  If 'range' is set, it is the ending
13858          * of a range--check its validity.  Later, we will handle each
13859          * individual code point in the range.  If 'range' isn't set, this
13860          * could be the beginning of a range, so check for that by looking
13861          * ahead to see if the next real character to be processed is the range
13862          * indicator--the minus sign */
13863
13864         if (skip_white) {
13865             RExC_parse = regpatws(pRExC_state, RExC_parse,
13866                                 FALSE /* means don't recognize comments */);
13867         }
13868
13869         if (range) {
13870             if (prevvalue > value) /* b-a */ {
13871                 const int w = RExC_parse - rangebegin;
13872                 vFAIL2utf8f(
13873                     "Invalid [] range \"%"UTF8f"\"",
13874                     UTF8fARG(UTF, w, rangebegin));
13875                 range = 0; /* not a valid range */
13876             }
13877         }
13878         else {
13879             prevvalue = value; /* save the beginning of the potential range */
13880             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13881                 && *RExC_parse == '-')
13882             {
13883                 char* next_char_ptr = RExC_parse + 1;
13884                 if (skip_white) {   /* Get the next real char after the '-' */
13885                     next_char_ptr = regpatws(pRExC_state,
13886                                              RExC_parse + 1,
13887                                              FALSE); /* means don't recognize
13888                                                         comments */
13889                 }
13890
13891                 /* If the '-' is at the end of the class (just before the ']',
13892                  * it is a literal minus; otherwise it is a range */
13893                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13894                     RExC_parse = next_char_ptr;
13895
13896                     /* a bad range like \w-, [:word:]- ? */
13897                     if (namedclass > OOB_NAMEDCLASS) {
13898                         if (strict || ckWARN(WARN_REGEXP)) {
13899                             const int w =
13900                                 RExC_parse >= rangebegin ?
13901                                 RExC_parse - rangebegin : 0;
13902                             if (strict) {
13903                                 vFAIL4("False [] range \"%*.*s\"",
13904                                     w, w, rangebegin);
13905                             }
13906                             else {
13907                                 vWARN4(RExC_parse,
13908                                     "False [] range \"%*.*s\"",
13909                                     w, w, rangebegin);
13910                             }
13911                         }
13912                         if (!SIZE_ONLY) {
13913                             cp_list = add_cp_to_invlist(cp_list, '-');
13914                         }
13915                         element_count++;
13916                     } else
13917                         range = 1;      /* yeah, it's a range! */
13918                     continue;   /* but do it the next time */
13919                 }
13920             }
13921         }
13922
13923         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13924          * if not */
13925
13926         /* non-Latin1 code point implies unicode semantics.  Must be set in
13927          * pass1 so is there for the whole of pass 2 */
13928         if (value > 255) {
13929             RExC_uni_semantics = 1;
13930         }
13931
13932         /* Ready to process either the single value, or the completed range.
13933          * For single-valued non-inverted ranges, we consider the possibility
13934          * of multi-char folds.  (We made a conscious decision to not do this
13935          * for the other cases because it can often lead to non-intuitive
13936          * results.  For example, you have the peculiar case that:
13937          *  "s s" =~ /^[^\xDF]+$/i => Y
13938          *  "ss"  =~ /^[^\xDF]+$/i => N
13939          *
13940          * See [perl #89750] */
13941         if (FOLD && allow_multi_folds && value == prevvalue) {
13942             if (value == LATIN_SMALL_LETTER_SHARP_S
13943                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13944                                                         value)))
13945             {
13946                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13947
13948                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13949                 STRLEN foldlen;
13950
13951                 UV folded = _to_uni_fold_flags(
13952                                 value,
13953                                 foldbuf,
13954                                 &foldlen,
13955                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13956                                                    ? FOLD_FLAGS_NOMIX_ASCII
13957                                                    : 0)
13958                                 );
13959
13960                 /* Here, <folded> should be the first character of the
13961                  * multi-char fold of <value>, with <foldbuf> containing the
13962                  * whole thing.  But, if this fold is not allowed (because of
13963                  * the flags), <fold> will be the same as <value>, and should
13964                  * be processed like any other character, so skip the special
13965                  * handling */
13966                 if (folded != value) {
13967
13968                     /* Skip if we are recursed, currently parsing the class
13969                      * again.  Otherwise add this character to the list of
13970                      * multi-char folds. */
13971                     if (! RExC_in_multi_char_class) {
13972                         AV** this_array_ptr;
13973                         AV* this_array;
13974                         STRLEN cp_count = utf8_length(foldbuf,
13975                                                       foldbuf + foldlen);
13976                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13977
13978                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13979
13980
13981                         if (! multi_char_matches) {
13982                             multi_char_matches = newAV();
13983                         }
13984
13985                         /* <multi_char_matches> is actually an array of arrays.
13986                          * There will be one or two top-level elements: [2],
13987                          * and/or [3].  The [2] element is an array, each
13988                          * element thereof is a character which folds to TWO
13989                          * characters; [3] is for folds to THREE characters.
13990                          * (Unicode guarantees a maximum of 3 characters in any
13991                          * fold.)  When we rewrite the character class below,
13992                          * we will do so such that the longest folds are
13993                          * written first, so that it prefers the longest
13994                          * matching strings first.  This is done even if it
13995                          * turns out that any quantifier is non-greedy, out of
13996                          * programmer laziness.  Tom Christiansen has agreed
13997                          * that this is ok.  This makes the test for the
13998                          * ligature 'ffi' come before the test for 'ff' */
13999                         if (av_exists(multi_char_matches, cp_count)) {
14000                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14001                                                              cp_count, FALSE);
14002                             this_array = *this_array_ptr;
14003                         }
14004                         else {
14005                             this_array = newAV();
14006                             av_store(multi_char_matches, cp_count,
14007                                      (SV*) this_array);
14008                         }
14009                         av_push(this_array, multi_fold);
14010                     }
14011
14012                     /* This element should not be processed further in this
14013                      * class */
14014                     element_count--;
14015                     value = save_value;
14016                     prevvalue = save_prevvalue;
14017                     continue;
14018                 }
14019             }
14020         }
14021
14022         /* Deal with this element of the class */
14023         if (! SIZE_ONLY) {
14024 #ifndef EBCDIC
14025             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14026                                                      prevvalue, value);
14027 #else
14028             SV* this_range = _new_invlist(1);
14029             _append_range_to_invlist(this_range, prevvalue, value);
14030
14031             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14032              * If this range was specified using something like 'i-j', we want
14033              * to include only the 'i' and the 'j', and not anything in
14034              * between, so exclude non-ASCII, non-alphabetics from it.
14035              * However, if the range was specified with something like
14036              * [\x89-\x91] or [\x89-j], all code points within it should be
14037              * included.  literal_endpoint==2 means both ends of the range used
14038              * a literal character, not \x{foo} */
14039             if (literal_endpoint == 2
14040                 && ((prevvalue >= 'a' && value <= 'z')
14041                     || (prevvalue >= 'A' && value <= 'Z')))
14042             {
14043                 _invlist_intersection(this_range, PL_ASCII,
14044                                       &this_range);
14045
14046                 /* Since this above only contains ascii, the intersection of it
14047                  * with anything will still yield only ascii */
14048                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14049                                       &this_range);
14050             }
14051             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14052             literal_endpoint = 0;
14053 #endif
14054         }
14055
14056         range = 0; /* this range (if it was one) is done now */
14057     } /* End of loop through all the text within the brackets */
14058
14059     /* If anything in the class expands to more than one character, we have to
14060      * deal with them by building up a substitute parse string, and recursively
14061      * calling reg() on it, instead of proceeding */
14062     if (multi_char_matches) {
14063         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14064         I32 cp_count;
14065         STRLEN len;
14066         char *save_end = RExC_end;
14067         char *save_parse = RExC_parse;
14068         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14069                                        a "|" */
14070         I32 reg_flags;
14071
14072         assert(! invert);
14073 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14074            because too confusing */
14075         if (invert) {
14076             sv_catpv(substitute_parse, "(?:");
14077         }
14078 #endif
14079
14080         /* Look at the longest folds first */
14081         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14082
14083             if (av_exists(multi_char_matches, cp_count)) {
14084                 AV** this_array_ptr;
14085                 SV* this_sequence;
14086
14087                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14088                                                  cp_count, FALSE);
14089                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14090                                                                 &PL_sv_undef)
14091                 {
14092                     if (! first_time) {
14093                         sv_catpv(substitute_parse, "|");
14094                     }
14095                     first_time = FALSE;
14096
14097                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14098                 }
14099             }
14100         }
14101
14102         /* If the character class contains anything else besides these
14103          * multi-character folds, have to include it in recursive parsing */
14104         if (element_count) {
14105             sv_catpv(substitute_parse, "|[");
14106             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14107             sv_catpv(substitute_parse, "]");
14108         }
14109
14110         sv_catpv(substitute_parse, ")");
14111 #if 0
14112         if (invert) {
14113             /* This is a way to get the parse to skip forward a whole named
14114              * sequence instead of matching the 2nd character when it fails the
14115              * first */
14116             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14117         }
14118 #endif
14119
14120         RExC_parse = SvPV(substitute_parse, len);
14121         RExC_end = RExC_parse + len;
14122         RExC_in_multi_char_class = 1;
14123         RExC_emit = (regnode *)orig_emit;
14124
14125         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14126
14127         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14128
14129         RExC_parse = save_parse;
14130         RExC_end = save_end;
14131         RExC_in_multi_char_class = 0;
14132         SvREFCNT_dec_NN(multi_char_matches);
14133         return ret;
14134     }
14135
14136     /* Here, we've gone through the entire class and dealt with multi-char
14137      * folds.  We are now in a position that we can do some checks to see if we
14138      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14139      * Currently we only do two checks:
14140      * 1) is in the unlikely event that the user has specified both, eg. \w and
14141      *    \W under /l, then the class matches everything.  (This optimization
14142      *    is done only to make the optimizer code run later work.)
14143      * 2) if the character class contains only a single element (including a
14144      *    single range), we see if there is an equivalent node for it.
14145      * Other checks are possible */
14146     if (! ret_invlist   /* Can't optimize if returning the constructed
14147                            inversion list */
14148         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14149     {
14150         U8 op = END;
14151         U8 arg = 0;
14152
14153         if (UNLIKELY(posixl_matches_all)) {
14154             op = SANY;
14155         }
14156         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14157                                                    \w or [:digit:] or \p{foo}
14158                                                  */
14159
14160             /* All named classes are mapped into POSIXish nodes, with its FLAG
14161              * argument giving which class it is */
14162             switch ((I32)namedclass) {
14163                 case ANYOF_UNIPROP:
14164                     break;
14165
14166                 /* These don't depend on the charset modifiers.  They always
14167                  * match under /u rules */
14168                 case ANYOF_NHORIZWS:
14169                 case ANYOF_HORIZWS:
14170                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14171                     /* FALLTHROUGH */
14172
14173                 case ANYOF_NVERTWS:
14174                 case ANYOF_VERTWS:
14175                     op = POSIXU;
14176                     goto join_posix;
14177
14178                 /* The actual POSIXish node for all the rest depends on the
14179                  * charset modifier.  The ones in the first set depend only on
14180                  * ASCII or, if available on this platform, locale */
14181                 case ANYOF_ASCII:
14182                 case ANYOF_NASCII:
14183 #ifdef HAS_ISASCII
14184                     op = (LOC) ? POSIXL : POSIXA;
14185 #else
14186                     op = POSIXA;
14187 #endif
14188                     goto join_posix;
14189
14190                 case ANYOF_NCASED:
14191                 case ANYOF_LOWER:
14192                 case ANYOF_NLOWER:
14193                 case ANYOF_UPPER:
14194                 case ANYOF_NUPPER:
14195                     /* under /a could be alpha */
14196                     if (FOLD) {
14197                         if (ASCII_RESTRICTED) {
14198                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14199                         }
14200                         else if (! LOC) {
14201                             break;
14202                         }
14203                     }
14204                     /* FALLTHROUGH */
14205
14206                 /* The rest have more possibilities depending on the charset.
14207                  * We take advantage of the enum ordering of the charset
14208                  * modifiers to get the exact node type, */
14209                 default:
14210                     op = POSIXD + get_regex_charset(RExC_flags);
14211                     if (op > POSIXA) { /* /aa is same as /a */
14212                         op = POSIXA;
14213                     }
14214
14215                 join_posix:
14216                     /* The odd numbered ones are the complements of the
14217                      * next-lower even number one */
14218                     if (namedclass % 2 == 1) {
14219                         invert = ! invert;
14220                         namedclass--;
14221                     }
14222                     arg = namedclass_to_classnum(namedclass);
14223                     break;
14224             }
14225         }
14226         else if (value == prevvalue) {
14227
14228             /* Here, the class consists of just a single code point */
14229
14230             if (invert) {
14231                 if (! LOC && value == '\n') {
14232                     op = REG_ANY; /* Optimize [^\n] */
14233                     *flagp |= HASWIDTH|SIMPLE;
14234                     RExC_naughty++;
14235                 }
14236             }
14237             else if (value < 256 || UTF) {
14238
14239                 /* Optimize a single value into an EXACTish node, but not if it
14240                  * would require converting the pattern to UTF-8. */
14241                 op = compute_EXACTish(pRExC_state);
14242             }
14243         } /* Otherwise is a range */
14244         else if (! LOC) {   /* locale could vary these */
14245             if (prevvalue == '0') {
14246                 if (value == '9') {
14247                     arg = _CC_DIGIT;
14248                     op = POSIXA;
14249                 }
14250             }
14251         }
14252
14253         /* Here, we have changed <op> away from its initial value iff we found
14254          * an optimization */
14255         if (op != END) {
14256
14257             /* Throw away this ANYOF regnode, and emit the calculated one,
14258              * which should correspond to the beginning, not current, state of
14259              * the parse */
14260             const char * cur_parse = RExC_parse;
14261             RExC_parse = (char *)orig_parse;
14262             if ( SIZE_ONLY) {
14263                 if (! LOC) {
14264
14265                     /* To get locale nodes to not use the full ANYOF size would
14266                      * require moving the code above that writes the portions
14267                      * of it that aren't in other nodes to after this point.
14268                      * e.g.  ANYOF_POSIXL_SET */
14269                     RExC_size = orig_size;
14270                 }
14271             }
14272             else {
14273                 RExC_emit = (regnode *)orig_emit;
14274                 if (PL_regkind[op] == POSIXD) {
14275                     if (op == POSIXL) {
14276                         RExC_contains_locale = 1;
14277                     }
14278                     if (invert) {
14279                         op += NPOSIXD - POSIXD;
14280                     }
14281                 }
14282             }
14283
14284             ret = reg_node(pRExC_state, op);
14285
14286             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14287                 if (! SIZE_ONLY) {
14288                     FLAGS(ret) = arg;
14289                 }
14290                 *flagp |= HASWIDTH|SIMPLE;
14291             }
14292             else if (PL_regkind[op] == EXACT) {
14293                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14294                                            TRUE /* downgradable to EXACT */
14295                                            );
14296             }
14297
14298             RExC_parse = (char *) cur_parse;
14299
14300             SvREFCNT_dec(posixes);
14301             SvREFCNT_dec(nposixes);
14302             SvREFCNT_dec(cp_list);
14303             SvREFCNT_dec(cp_foldable_list);
14304             return ret;
14305         }
14306     }
14307
14308     if (SIZE_ONLY)
14309         return ret;
14310     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14311
14312     /* If folding, we calculate all characters that could fold to or from the
14313      * ones already on the list */
14314     if (cp_foldable_list) {
14315         if (FOLD) {
14316             UV start, end;      /* End points of code point ranges */
14317
14318             SV* fold_intersection = NULL;
14319             SV** use_list;
14320
14321             /* Our calculated list will be for Unicode rules.  For locale
14322              * matching, we have to keep a separate list that is consulted at
14323              * runtime only when the locale indicates Unicode rules.  For
14324              * non-locale, we just use to the general list */
14325             if (LOC) {
14326                 use_list = &only_utf8_locale_list;
14327             }
14328             else {
14329                 use_list = &cp_list;
14330             }
14331
14332             /* Only the characters in this class that participate in folds need
14333              * be checked.  Get the intersection of this class and all the
14334              * possible characters that are foldable.  This can quickly narrow
14335              * down a large class */
14336             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14337                                   &fold_intersection);
14338
14339             /* The folds for all the Latin1 characters are hard-coded into this
14340              * program, but we have to go out to disk to get the others. */
14341             if (invlist_highest(cp_foldable_list) >= 256) {
14342
14343                 /* This is a hash that for a particular fold gives all
14344                  * characters that are involved in it */
14345                 if (! PL_utf8_foldclosures) {
14346
14347                     /* If the folds haven't been read in, call a fold function
14348                      * to force that */
14349                     if (! PL_utf8_tofold) {
14350                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14351
14352                         /* This string is just a short named one above \xff */
14353                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14354                         assert(PL_utf8_tofold); /* Verify that worked */
14355                     }
14356                     PL_utf8_foldclosures
14357                                       = _swash_inversion_hash(PL_utf8_tofold);
14358                 }
14359             }
14360
14361             /* Now look at the foldable characters in this class individually */
14362             invlist_iterinit(fold_intersection);
14363             while (invlist_iternext(fold_intersection, &start, &end)) {
14364                 UV j;
14365
14366                 /* Look at every character in the range */
14367                 for (j = start; j <= end; j++) {
14368                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14369                     STRLEN foldlen;
14370                     SV** listp;
14371
14372                     if (j < 256) {
14373
14374                         /* We have the latin1 folding rules hard-coded here so
14375                          * that an innocent-looking character class, like
14376                          * /[ks]/i won't have to go out to disk to find the
14377                          * possible matches.  XXX It would be better to
14378                          * generate these via regen, in case a new version of
14379                          * the Unicode standard adds new mappings, though that
14380                          * is not really likely, and may be caught by the
14381                          * default: case of the switch below. */
14382
14383                         if (IS_IN_SOME_FOLD_L1(j)) {
14384
14385                             /* ASCII is always matched; non-ASCII is matched
14386                              * only under Unicode rules (which could happen
14387                              * under /l if the locale is a UTF-8 one */
14388                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14389                                 *use_list = add_cp_to_invlist(*use_list,
14390                                                             PL_fold_latin1[j]);
14391                             }
14392                             else {
14393                                 depends_list =
14394                                  add_cp_to_invlist(depends_list,
14395                                                    PL_fold_latin1[j]);
14396                             }
14397                         }
14398
14399                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14400                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14401                         {
14402                             /* Certain Latin1 characters have matches outside
14403                             * Latin1.  To get here, <j> is one of those
14404                             * characters.   None of these matches is valid for
14405                             * ASCII characters under /aa, which is why the 'if'
14406                             * just above excludes those.  These matches only
14407                             * happen when the target string is utf8.  The code
14408                             * below adds the single fold closures for <j> to the
14409                             * inversion list. */
14410
14411                             switch (j) {
14412                                 case 'k':
14413                                 case 'K':
14414                                   *use_list =
14415                                      add_cp_to_invlist(*use_list, KELVIN_SIGN);
14416                                     break;
14417                                 case 's':
14418                                 case 'S':
14419                                   *use_list = add_cp_to_invlist(*use_list,
14420                                                     LATIN_SMALL_LETTER_LONG_S);
14421                                     break;
14422                                 case MICRO_SIGN:
14423                                   *use_list = add_cp_to_invlist(*use_list,
14424                                                       GREEK_CAPITAL_LETTER_MU);
14425                                   *use_list = add_cp_to_invlist(*use_list,
14426                                                         GREEK_SMALL_LETTER_MU);
14427                                     break;
14428                                 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14429                                 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14430                                   *use_list =
14431                                    add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14432                                     break;
14433                                 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14434                                   *use_list = add_cp_to_invlist(*use_list,
14435                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14436                                     break;
14437                                 case LATIN_SMALL_LETTER_SHARP_S:
14438                                   *use_list = add_cp_to_invlist(*use_list,
14439                                                  LATIN_CAPITAL_LETTER_SHARP_S);
14440                                     break;
14441                                 case 'F': case 'f':
14442                                 case 'I': case 'i':
14443                                 case 'L': case 'l':
14444                                 case 'T': case 't':
14445                                 case 'A': case 'a':
14446                                 case 'H': case 'h':
14447                                 case 'J': case 'j':
14448                                 case 'N': case 'n':
14449                                 case 'W': case 'w':
14450                                 case 'Y': case 'y':
14451                                     /* These all are targets of multi-character
14452                                      * folds from code points that require UTF8
14453                                      * to express, so they can't match unless
14454                                      * the target string is in UTF-8, so no
14455                                      * action here is necessary, as regexec.c
14456                                      * properly handles the general case for
14457                                      * UTF-8 matching and multi-char folds */
14458                                     break;
14459                                 default:
14460                                     /* Use deprecated warning to increase the
14461                                     * chances of this being output */
14462                                     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14463                                     break;
14464                             }
14465                         }
14466                         continue;
14467                     }
14468
14469                     /* Here is an above Latin1 character.  We don't have the
14470                      * rules hard-coded for it.  First, get its fold.  This is
14471                      * the simple fold, as the multi-character folds have been
14472                      * handled earlier and separated out */
14473                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14474                                                         (ASCII_FOLD_RESTRICTED)
14475                                                         ? FOLD_FLAGS_NOMIX_ASCII
14476                                                         : 0);
14477
14478                     /* Single character fold of above Latin1.  Add everything in
14479                     * its fold closure to the list that this node should match.
14480                     * The fold closures data structure is a hash with the keys
14481                     * being the UTF-8 of every character that is folded to, like
14482                     * 'k', and the values each an array of all code points that
14483                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14484                     * Multi-character folds are not included */
14485                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14486                                         (char *) foldbuf, foldlen, FALSE)))
14487                     {
14488                         AV* list = (AV*) *listp;
14489                         IV k;
14490                         for (k = 0; k <= av_tindex(list); k++) {
14491                             SV** c_p = av_fetch(list, k, FALSE);
14492                             UV c;
14493                             if (c_p == NULL) {
14494                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14495                             }
14496                             c = SvUV(*c_p);
14497
14498                             /* /aa doesn't allow folds between ASCII and non- */
14499                             if ((ASCII_FOLD_RESTRICTED
14500                                 && (isASCII(c) != isASCII(j))))
14501                             {
14502                                 continue;
14503                             }
14504
14505                             /* Folds under /l which cross the 255/256 boundary
14506                              * are added to a separate list.  (These are valid
14507                              * only when the locale is UTF-8.) */
14508                             if (c < 256 && LOC) {
14509                                 *use_list = add_cp_to_invlist(*use_list, c);
14510                                 continue;
14511                             }
14512
14513                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14514                             {
14515                                 cp_list = add_cp_to_invlist(cp_list, c);
14516                             }
14517                             else {
14518                                 /* Similarly folds involving non-ascii Latin1
14519                                 * characters under /d are added to their list */
14520                                 depends_list = add_cp_to_invlist(depends_list,
14521                                                                  c);
14522                             }
14523                         }
14524                     }
14525                 }
14526             }
14527             SvREFCNT_dec_NN(fold_intersection);
14528         }
14529
14530         /* Now that we have finished adding all the folds, there is no reason
14531          * to keep the foldable list separate */
14532         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14533         SvREFCNT_dec_NN(cp_foldable_list);
14534     }
14535
14536     /* And combine the result (if any) with any inversion list from posix
14537      * classes.  The lists are kept separate up to now because we don't want to
14538      * fold the classes (folding of those is automatically handled by the swash
14539      * fetching code) */
14540     if (posixes || nposixes) {
14541         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14542             /* Under /a and /aa, nothing above ASCII matches these */
14543             _invlist_intersection(posixes,
14544                                   PL_XPosix_ptrs[_CC_ASCII],
14545                                   &posixes);
14546         }
14547         if (nposixes) {
14548             if (DEPENDS_SEMANTICS) {
14549                 /* Under /d, everything in the upper half of the Latin1 range
14550                  * matches these complements */
14551                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14552             }
14553             else if (AT_LEAST_ASCII_RESTRICTED) {
14554                 /* Under /a and /aa, everything above ASCII matches these
14555                  * complements */
14556                 _invlist_union_complement_2nd(nposixes,
14557                                               PL_XPosix_ptrs[_CC_ASCII],
14558                                               &nposixes);
14559             }
14560             if (posixes) {
14561                 _invlist_union(posixes, nposixes, &posixes);
14562                 SvREFCNT_dec_NN(nposixes);
14563             }
14564             else {
14565                 posixes = nposixes;
14566             }
14567         }
14568         if (! DEPENDS_SEMANTICS) {
14569             if (cp_list) {
14570                 _invlist_union(cp_list, posixes, &cp_list);
14571                 SvREFCNT_dec_NN(posixes);
14572             }
14573             else {
14574                 cp_list = posixes;
14575             }
14576         }
14577         else {
14578             /* Under /d, we put into a separate list the Latin1 things that
14579              * match only when the target string is utf8 */
14580             SV* nonascii_but_latin1_properties = NULL;
14581             _invlist_intersection(posixes, PL_UpperLatin1,
14582                                   &nonascii_but_latin1_properties);
14583             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14584                               &posixes);
14585             if (cp_list) {
14586                 _invlist_union(cp_list, posixes, &cp_list);
14587                 SvREFCNT_dec_NN(posixes);
14588             }
14589             else {
14590                 cp_list = posixes;
14591             }
14592
14593             if (depends_list) {
14594                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14595                                &depends_list);
14596                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14597             }
14598             else {
14599                 depends_list = nonascii_but_latin1_properties;
14600             }
14601         }
14602     }
14603
14604     /* And combine the result (if any) with any inversion list from properties.
14605      * The lists are kept separate up to now so that we can distinguish the two
14606      * in regards to matching above-Unicode.  A run-time warning is generated
14607      * if a Unicode property is matched against a non-Unicode code point. But,
14608      * we allow user-defined properties to match anything, without any warning,
14609      * and we also suppress the warning if there is a portion of the character
14610      * class that isn't a Unicode property, and which matches above Unicode, \W
14611      * or [\x{110000}] for example.
14612      * (Note that in this case, unlike the Posix one above, there is no
14613      * <depends_list>, because having a Unicode property forces Unicode
14614      * semantics */
14615     if (properties) {
14616         if (cp_list) {
14617
14618             /* If it matters to the final outcome, see if a non-property
14619              * component of the class matches above Unicode.  If so, the
14620              * warning gets suppressed.  This is true even if just a single
14621              * such code point is specified, as though not strictly correct if
14622              * another such code point is matched against, the fact that they
14623              * are using above-Unicode code points indicates they should know
14624              * the issues involved */
14625             if (warn_super) {
14626                 warn_super = ! (invert
14627                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14628             }
14629
14630             _invlist_union(properties, cp_list, &cp_list);
14631             SvREFCNT_dec_NN(properties);
14632         }
14633         else {
14634             cp_list = properties;
14635         }
14636
14637         if (warn_super) {
14638             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14639         }
14640     }
14641
14642     /* Here, we have calculated what code points should be in the character
14643      * class.
14644      *
14645      * Now we can see about various optimizations.  Fold calculation (which we
14646      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14647      * would invert to include K, which under /i would match k, which it
14648      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14649      * folded until runtime */
14650
14651     /* If we didn't do folding, it's because some information isn't available
14652      * until runtime; set the run-time fold flag for these.  (We don't have to
14653      * worry about properties folding, as that is taken care of by the swash
14654      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14655      * locales, or the class matches at least one 0-255 range code point */
14656     if (LOC && FOLD) {
14657         if (only_utf8_locale_list) {
14658             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14659         }
14660         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14661                                the list */
14662             UV start, end;
14663             invlist_iterinit(cp_list);
14664             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14665                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14666             }
14667             invlist_iterfinish(cp_list);
14668         }
14669     }
14670
14671     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14672      * at compile time.  Besides not inverting folded locale now, we can't
14673      * invert if there are things such as \w, which aren't known until runtime
14674      * */
14675     if (cp_list
14676         && invert
14677         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14678         && ! depends_list
14679         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14680     {
14681         _invlist_invert(cp_list);
14682
14683         /* Any swash can't be used as-is, because we've inverted things */
14684         if (swash) {
14685             SvREFCNT_dec_NN(swash);
14686             swash = NULL;
14687         }
14688
14689         /* Clear the invert flag since have just done it here */
14690         invert = FALSE;
14691     }
14692
14693     if (ret_invlist) {
14694         *ret_invlist = cp_list;
14695         SvREFCNT_dec(swash);
14696
14697         /* Discard the generated node */
14698         if (SIZE_ONLY) {
14699             RExC_size = orig_size;
14700         }
14701         else {
14702             RExC_emit = orig_emit;
14703         }
14704         return orig_emit;
14705     }
14706
14707     /* Some character classes are equivalent to other nodes.  Such nodes take
14708      * up less room and generally fewer operations to execute than ANYOF nodes.
14709      * Above, we checked for and optimized into some such equivalents for
14710      * certain common classes that are easy to test.  Getting to this point in
14711      * the code means that the class didn't get optimized there.  Since this
14712      * code is only executed in Pass 2, it is too late to save space--it has
14713      * been allocated in Pass 1, and currently isn't given back.  But turning
14714      * things into an EXACTish node can allow the optimizer to join it to any
14715      * adjacent such nodes.  And if the class is equivalent to things like /./,
14716      * expensive run-time swashes can be avoided.  Now that we have more
14717      * complete information, we can find things necessarily missed by the
14718      * earlier code.  I (khw) am not sure how much to look for here.  It would
14719      * be easy, but perhaps too slow, to check any candidates against all the
14720      * node types they could possibly match using _invlistEQ(). */
14721
14722     if (cp_list
14723         && ! invert
14724         && ! depends_list
14725         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14726         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14727
14728            /* We don't optimize if we are supposed to make sure all non-Unicode
14729             * code points raise a warning, as only ANYOF nodes have this check.
14730             * */
14731         && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14732     {
14733         UV start, end;
14734         U8 op = END;  /* The optimzation node-type */
14735         const char * cur_parse= RExC_parse;
14736
14737         invlist_iterinit(cp_list);
14738         if (! invlist_iternext(cp_list, &start, &end)) {
14739
14740             /* Here, the list is empty.  This happens, for example, when a
14741              * Unicode property is the only thing in the character class, and
14742              * it doesn't match anything.  (perluniprops.pod notes such
14743              * properties) */
14744             op = OPFAIL;
14745             *flagp |= HASWIDTH|SIMPLE;
14746         }
14747         else if (start == end) {    /* The range is a single code point */
14748             if (! invlist_iternext(cp_list, &start, &end)
14749
14750                     /* Don't do this optimization if it would require changing
14751                      * the pattern to UTF-8 */
14752                 && (start < 256 || UTF))
14753             {
14754                 /* Here, the list contains a single code point.  Can optimize
14755                  * into an EXACTish node */
14756
14757                 value = start;
14758
14759                 if (! FOLD) {
14760                     op = EXACT;
14761                 }
14762                 else if (LOC) {
14763
14764                     /* A locale node under folding with one code point can be
14765                      * an EXACTFL, as its fold won't be calculated until
14766                      * runtime */
14767                     op = EXACTFL;
14768                 }
14769                 else {
14770
14771                     /* Here, we are generally folding, but there is only one
14772                      * code point to match.  If we have to, we use an EXACT
14773                      * node, but it would be better for joining with adjacent
14774                      * nodes in the optimization pass if we used the same
14775                      * EXACTFish node that any such are likely to be.  We can
14776                      * do this iff the code point doesn't participate in any
14777                      * folds.  For example, an EXACTF of a colon is the same as
14778                      * an EXACT one, since nothing folds to or from a colon. */
14779                     if (value < 256) {
14780                         if (IS_IN_SOME_FOLD_L1(value)) {
14781                             op = EXACT;
14782                         }
14783                     }
14784                     else {
14785                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14786                             op = EXACT;
14787                         }
14788                     }
14789
14790                     /* If we haven't found the node type, above, it means we
14791                      * can use the prevailing one */
14792                     if (op == END) {
14793                         op = compute_EXACTish(pRExC_state);
14794                     }
14795                 }
14796             }
14797         }
14798         else if (start == 0) {
14799             if (end == UV_MAX) {
14800                 op = SANY;
14801                 *flagp |= HASWIDTH|SIMPLE;
14802                 RExC_naughty++;
14803             }
14804             else if (end == '\n' - 1
14805                     && invlist_iternext(cp_list, &start, &end)
14806                     && start == '\n' + 1 && end == UV_MAX)
14807             {
14808                 op = REG_ANY;
14809                 *flagp |= HASWIDTH|SIMPLE;
14810                 RExC_naughty++;
14811             }
14812         }
14813         invlist_iterfinish(cp_list);
14814
14815         if (op != END) {
14816             RExC_parse = (char *)orig_parse;
14817             RExC_emit = (regnode *)orig_emit;
14818
14819             ret = reg_node(pRExC_state, op);
14820
14821             RExC_parse = (char *)cur_parse;
14822
14823             if (PL_regkind[op] == EXACT) {
14824                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14825                                            TRUE /* downgradable to EXACT */
14826                                           );
14827             }
14828
14829             SvREFCNT_dec_NN(cp_list);
14830             return ret;
14831         }
14832     }
14833
14834     /* Here, <cp_list> contains all the code points we can determine at
14835      * compile time that match under all conditions.  Go through it, and
14836      * for things that belong in the bitmap, put them there, and delete from
14837      * <cp_list>.  While we are at it, see if everything above 255 is in the
14838      * list, and if so, set a flag to speed up execution */
14839
14840     populate_ANYOF_from_invlist(ret, &cp_list);
14841
14842     if (invert) {
14843         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14844     }
14845
14846     /* Here, the bitmap has been populated with all the Latin1 code points that
14847      * always match.  Can now add to the overall list those that match only
14848      * when the target string is UTF-8 (<depends_list>). */
14849     if (depends_list) {
14850         if (cp_list) {
14851             _invlist_union(cp_list, depends_list, &cp_list);
14852             SvREFCNT_dec_NN(depends_list);
14853         }
14854         else {
14855             cp_list = depends_list;
14856         }
14857         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14858     }
14859
14860     /* If there is a swash and more than one element, we can't use the swash in
14861      * the optimization below. */
14862     if (swash && element_count > 1) {
14863         SvREFCNT_dec_NN(swash);
14864         swash = NULL;
14865     }
14866
14867     set_ANYOF_arg(pRExC_state, ret, cp_list,
14868                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14869                    ? listsv : NULL,
14870                   only_utf8_locale_list,
14871                   swash, has_user_defined_property);
14872
14873     *flagp |= HASWIDTH|SIMPLE;
14874
14875     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14876         RExC_contains_locale = 1;
14877     }
14878
14879     return ret;
14880 }
14881
14882 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14883
14884 STATIC void
14885 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14886                 regnode* const node,
14887                 SV* const cp_list,
14888                 SV* const runtime_defns,
14889                 SV* const only_utf8_locale_list,
14890                 SV* const swash,
14891                 const bool has_user_defined_property)
14892 {
14893     /* Sets the arg field of an ANYOF-type node 'node', using information about
14894      * the node passed-in.  If there is nothing outside the node's bitmap, the
14895      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14896      * the count returned by add_data(), having allocated and stored an array,
14897      * av, that that count references, as follows:
14898      *  av[0] stores the character class description in its textual form.
14899      *        This is used later (regexec.c:Perl_regclass_swash()) to
14900      *        initialize the appropriate swash, and is also useful for dumping
14901      *        the regnode.  This is set to &PL_sv_undef if the textual
14902      *        description is not needed at run-time (as happens if the other
14903      *        elements completely define the class)
14904      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14905      *        computed from av[0].  But if no further computation need be done,
14906      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14907      *  av[2] stores the inversion list of code points that match only if the
14908      *        current locale is UTF-8
14909      *  av[3] stores the cp_list inversion list for use in addition or instead
14910      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14911      *        (Otherwise everything needed is already in av[0] and av[1])
14912      *  av[4] is set if any component of the class is from a user-defined
14913      *        property; used only if av[3] exists */
14914
14915     UV n;
14916
14917     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14918
14919     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14920         assert(! (ANYOF_FLAGS(node)
14921                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14922         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14923     }
14924     else {
14925         AV * const av = newAV();
14926         SV *rv;
14927
14928         assert(ANYOF_FLAGS(node)
14929                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14930
14931         av_store(av, 0, (runtime_defns)
14932                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14933         if (swash) {
14934             av_store(av, 1, swash);
14935             SvREFCNT_dec_NN(cp_list);
14936         }
14937         else {
14938             av_store(av, 1, &PL_sv_undef);
14939             if (cp_list) {
14940                 av_store(av, 3, cp_list);
14941                 av_store(av, 4, newSVuv(has_user_defined_property));
14942             }
14943         }
14944
14945         if (only_utf8_locale_list) {
14946             av_store(av, 2, only_utf8_locale_list);
14947         }
14948         else {
14949             av_store(av, 2, &PL_sv_undef);
14950         }
14951
14952         rv = newRV_noinc(MUTABLE_SV(av));
14953         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14954         RExC_rxi->data->data[n] = (void*)rv;
14955         ARG_SET(node, n);
14956     }
14957 }
14958
14959
14960 /* reg_skipcomment()
14961
14962    Absorbs an /x style # comments from the input stream.
14963    Returns true if there is more text remaining in the stream.
14964    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14965    terminates the pattern without including a newline.
14966
14967    Note its the callers responsibility to ensure that we are
14968    actually in /x mode
14969
14970 */
14971
14972 STATIC bool
14973 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14974 {
14975     bool ended = 0;
14976
14977     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14978
14979     while (RExC_parse < RExC_end)
14980         if (*RExC_parse++ == '\n') {
14981             ended = 1;
14982             break;
14983         }
14984     if (!ended) {
14985         /* we ran off the end of the pattern without ending
14986            the comment, so we have to add an \n when wrapping */
14987         RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14988         return 0;
14989     } else
14990         return 1;
14991 }
14992
14993 /* nextchar()
14994
14995    Advances the parse position, and optionally absorbs
14996    "whitespace" from the inputstream.
14997
14998    Without /x "whitespace" means (?#...) style comments only,
14999    with /x this means (?#...) and # comments and whitespace proper.
15000
15001    Returns the RExC_parse point from BEFORE the scan occurs.
15002
15003    This is the /x friendly way of saying RExC_parse++.
15004 */
15005
15006 STATIC char*
15007 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15008 {
15009     char* const retval = RExC_parse++;
15010
15011     PERL_ARGS_ASSERT_NEXTCHAR;
15012
15013     for (;;) {
15014         if (RExC_end - RExC_parse >= 3
15015             && *RExC_parse == '('
15016             && RExC_parse[1] == '?'
15017             && RExC_parse[2] == '#')
15018         {
15019             while (*RExC_parse != ')') {
15020                 if (RExC_parse == RExC_end)
15021                     FAIL("Sequence (?#... not terminated");
15022                 RExC_parse++;
15023             }
15024             RExC_parse++;
15025             continue;
15026         }
15027         if (RExC_flags & RXf_PMf_EXTENDED) {
15028             if (isSPACE(*RExC_parse)) {
15029                 RExC_parse++;
15030                 continue;
15031             }
15032             else if (*RExC_parse == '#') {
15033                 if ( reg_skipcomment( pRExC_state ) )
15034                     continue;
15035             }
15036         }
15037         return retval;
15038     }
15039 }
15040
15041 /*
15042 - reg_node - emit a node
15043 */
15044 STATIC regnode *                        /* Location. */
15045 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15046 {
15047     dVAR;
15048     regnode *ptr;
15049     regnode * const ret = RExC_emit;
15050     GET_RE_DEBUG_FLAGS_DECL;
15051
15052     PERL_ARGS_ASSERT_REG_NODE;
15053
15054     if (SIZE_ONLY) {
15055         SIZE_ALIGN(RExC_size);
15056         RExC_size += 1;
15057         return(ret);
15058     }
15059     if (RExC_emit >= RExC_emit_bound)
15060         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15061                    op, RExC_emit, RExC_emit_bound);
15062
15063     NODE_ALIGN_FILL(ret);
15064     ptr = ret;
15065     FILL_ADVANCE_NODE(ptr, op);
15066 #ifdef RE_TRACK_PATTERN_OFFSETS
15067     if (RExC_offsets) {         /* MJD */
15068         MJD_OFFSET_DEBUG(
15069               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15070               "reg_node", __LINE__,
15071               PL_reg_name[op],
15072               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15073                 ? "Overwriting end of array!\n" : "OK",
15074               (UV)(RExC_emit - RExC_emit_start),
15075               (UV)(RExC_parse - RExC_start),
15076               (UV)RExC_offsets[0]));
15077         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15078     }
15079 #endif
15080     RExC_emit = ptr;
15081     return(ret);
15082 }
15083
15084 /*
15085 - reganode - emit a node with an argument
15086 */
15087 STATIC regnode *                        /* Location. */
15088 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15089 {
15090     dVAR;
15091     regnode *ptr;
15092     regnode * const ret = RExC_emit;
15093     GET_RE_DEBUG_FLAGS_DECL;
15094
15095     PERL_ARGS_ASSERT_REGANODE;
15096
15097     if (SIZE_ONLY) {
15098         SIZE_ALIGN(RExC_size);
15099         RExC_size += 2;
15100         /*
15101            We can't do this:
15102
15103            assert(2==regarglen[op]+1);
15104
15105            Anything larger than this has to allocate the extra amount.
15106            If we changed this to be:
15107
15108            RExC_size += (1 + regarglen[op]);
15109
15110            then it wouldn't matter. Its not clear what side effect
15111            might come from that so its not done so far.
15112            -- dmq
15113         */
15114         return(ret);
15115     }
15116     if (RExC_emit >= RExC_emit_bound)
15117         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15118                    op, RExC_emit, RExC_emit_bound);
15119
15120     NODE_ALIGN_FILL(ret);
15121     ptr = ret;
15122     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15123 #ifdef RE_TRACK_PATTERN_OFFSETS
15124     if (RExC_offsets) {         /* MJD */
15125         MJD_OFFSET_DEBUG(
15126               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15127               "reganode",
15128               __LINE__,
15129               PL_reg_name[op],
15130               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15131               "Overwriting end of array!\n" : "OK",
15132               (UV)(RExC_emit - RExC_emit_start),
15133               (UV)(RExC_parse - RExC_start),
15134               (UV)RExC_offsets[0]));
15135         Set_Cur_Node_Offset;
15136     }
15137 #endif
15138     RExC_emit = ptr;
15139     return(ret);
15140 }
15141
15142 /*
15143 - reguni - emit (if appropriate) a Unicode character
15144 */
15145 PERL_STATIC_INLINE STRLEN
15146 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15147 {
15148     dVAR;
15149
15150     PERL_ARGS_ASSERT_REGUNI;
15151
15152     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15153 }
15154
15155 /*
15156 - reginsert - insert an operator in front of already-emitted operand
15157 *
15158 * Means relocating the operand.
15159 */
15160 STATIC void
15161 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15162 {
15163     dVAR;
15164     regnode *src;
15165     regnode *dst;
15166     regnode *place;
15167     const int offset = regarglen[(U8)op];
15168     const int size = NODE_STEP_REGNODE + offset;
15169     GET_RE_DEBUG_FLAGS_DECL;
15170
15171     PERL_ARGS_ASSERT_REGINSERT;
15172     PERL_UNUSED_ARG(depth);
15173 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15174     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15175     if (SIZE_ONLY) {
15176         RExC_size += size;
15177         return;
15178     }
15179
15180     src = RExC_emit;
15181     RExC_emit += size;
15182     dst = RExC_emit;
15183     if (RExC_open_parens) {
15184         int paren;
15185         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15186         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15187             if ( RExC_open_parens[paren] >= opnd ) {
15188                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15189                 RExC_open_parens[paren] += size;
15190             } else {
15191                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15192             }
15193             if ( RExC_close_parens[paren] >= opnd ) {
15194                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15195                 RExC_close_parens[paren] += size;
15196             } else {
15197                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15198             }
15199         }
15200     }
15201
15202     while (src > opnd) {
15203         StructCopy(--src, --dst, regnode);
15204 #ifdef RE_TRACK_PATTERN_OFFSETS
15205         if (RExC_offsets) {     /* MJD 20010112 */
15206             MJD_OFFSET_DEBUG(
15207                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15208                   "reg_insert",
15209                   __LINE__,
15210                   PL_reg_name[op],
15211                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15212                     ? "Overwriting end of array!\n" : "OK",
15213                   (UV)(src - RExC_emit_start),
15214                   (UV)(dst - RExC_emit_start),
15215                   (UV)RExC_offsets[0]));
15216             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15217             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15218         }
15219 #endif
15220     }
15221
15222
15223     place = opnd;               /* Op node, where operand used to be. */
15224 #ifdef RE_TRACK_PATTERN_OFFSETS
15225     if (RExC_offsets) {         /* MJD */
15226         MJD_OFFSET_DEBUG(
15227               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15228               "reginsert",
15229               __LINE__,
15230               PL_reg_name[op],
15231               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15232               ? "Overwriting end of array!\n" : "OK",
15233               (UV)(place - RExC_emit_start),
15234               (UV)(RExC_parse - RExC_start),
15235               (UV)RExC_offsets[0]));
15236         Set_Node_Offset(place, RExC_parse);
15237         Set_Node_Length(place, 1);
15238     }
15239 #endif
15240     src = NEXTOPER(place);
15241     FILL_ADVANCE_NODE(place, op);
15242     Zero(src, offset, regnode);
15243 }
15244
15245 /*
15246 - regtail - set the next-pointer at the end of a node chain of p to val.
15247 - SEE ALSO: regtail_study
15248 */
15249 /* TODO: All three parms should be const */
15250 STATIC void
15251 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15252                 const regnode *val,U32 depth)
15253 {
15254     dVAR;
15255     regnode *scan;
15256     GET_RE_DEBUG_FLAGS_DECL;
15257
15258     PERL_ARGS_ASSERT_REGTAIL;
15259 #ifndef DEBUGGING
15260     PERL_UNUSED_ARG(depth);
15261 #endif
15262
15263     if (SIZE_ONLY)
15264         return;
15265
15266     /* Find last node. */
15267     scan = p;
15268     for (;;) {
15269         regnode * const temp = regnext(scan);
15270         DEBUG_PARSE_r({
15271             SV * const mysv=sv_newmortal();
15272             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15273             regprop(RExC_rx, mysv, scan, NULL);
15274             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15275                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15276                     (temp == NULL ? "->" : ""),
15277                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15278             );
15279         });
15280         if (temp == NULL)
15281             break;
15282         scan = temp;
15283     }
15284
15285     if (reg_off_by_arg[OP(scan)]) {
15286         ARG_SET(scan, val - scan);
15287     }
15288     else {
15289         NEXT_OFF(scan) = val - scan;
15290     }
15291 }
15292
15293 #ifdef DEBUGGING
15294 /*
15295 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15296 - Look for optimizable sequences at the same time.
15297 - currently only looks for EXACT chains.
15298
15299 This is experimental code. The idea is to use this routine to perform
15300 in place optimizations on branches and groups as they are constructed,
15301 with the long term intention of removing optimization from study_chunk so
15302 that it is purely analytical.
15303
15304 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15305 to control which is which.
15306
15307 */
15308 /* TODO: All four parms should be const */
15309
15310 STATIC U8
15311 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15312                       const regnode *val,U32 depth)
15313 {
15314     dVAR;
15315     regnode *scan;
15316     U8 exact = PSEUDO;
15317 #ifdef EXPERIMENTAL_INPLACESCAN
15318     I32 min = 0;
15319 #endif
15320     GET_RE_DEBUG_FLAGS_DECL;
15321
15322     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15323
15324
15325     if (SIZE_ONLY)
15326         return exact;
15327
15328     /* Find last node. */
15329
15330     scan = p;
15331     for (;;) {
15332         regnode * const temp = regnext(scan);
15333 #ifdef EXPERIMENTAL_INPLACESCAN
15334         if (PL_regkind[OP(scan)] == EXACT) {
15335             bool unfolded_multi_char;   /* Unexamined in this routine */
15336             if (join_exact(pRExC_state, scan, &min,
15337                            &unfolded_multi_char, 1, val, depth+1))
15338                 return EXACT;
15339         }
15340 #endif
15341         if ( exact ) {
15342             switch (OP(scan)) {
15343                 case EXACT:
15344                 case EXACTF:
15345                 case EXACTFA_NO_TRIE:
15346                 case EXACTFA:
15347                 case EXACTFU:
15348                 case EXACTFU_SS:
15349                 case EXACTFL:
15350                         if( exact == PSEUDO )
15351                             exact= OP(scan);
15352                         else if ( exact != OP(scan) )
15353                             exact= 0;
15354                 case NOTHING:
15355                     break;
15356                 default:
15357                     exact= 0;
15358             }
15359         }
15360         DEBUG_PARSE_r({
15361             SV * const mysv=sv_newmortal();
15362             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15363             regprop(RExC_rx, mysv, scan, NULL);
15364             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15365                 SvPV_nolen_const(mysv),
15366                 REG_NODE_NUM(scan),
15367                 PL_reg_name[exact]);
15368         });
15369         if (temp == NULL)
15370             break;
15371         scan = temp;
15372     }
15373     DEBUG_PARSE_r({
15374         SV * const mysv_val=sv_newmortal();
15375         DEBUG_PARSE_MSG("");
15376         regprop(RExC_rx, mysv_val, val, NULL);
15377         PerlIO_printf(Perl_debug_log,
15378                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15379                       SvPV_nolen_const(mysv_val),
15380                       (IV)REG_NODE_NUM(val),
15381                       (IV)(val - scan)
15382         );
15383     });
15384     if (reg_off_by_arg[OP(scan)]) {
15385         ARG_SET(scan, val - scan);
15386     }
15387     else {
15388         NEXT_OFF(scan) = val - scan;
15389     }
15390
15391     return exact;
15392 }
15393 #endif
15394
15395 /*
15396  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15397  */
15398 #ifdef DEBUGGING
15399
15400 static void
15401 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15402 {
15403     int bit;
15404     int set=0;
15405
15406     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15407
15408     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15409         if (flags & (1<<bit)) {
15410             if (!set++ && lead)
15411                 PerlIO_printf(Perl_debug_log, "%s",lead);
15412             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15413         }
15414     }
15415     if (lead)  {
15416         if (set)
15417             PerlIO_printf(Perl_debug_log, "\n");
15418         else
15419             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15420     }
15421 }
15422
15423 static void
15424 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15425 {
15426     int bit;
15427     int set=0;
15428     regex_charset cs;
15429
15430     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15431
15432     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15433         if (flags & (1<<bit)) {
15434             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15435                 continue;
15436             }
15437             if (!set++ && lead)
15438                 PerlIO_printf(Perl_debug_log, "%s",lead);
15439             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15440         }
15441     }
15442     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15443             if (!set++ && lead) {
15444                 PerlIO_printf(Perl_debug_log, "%s",lead);
15445             }
15446             switch (cs) {
15447                 case REGEX_UNICODE_CHARSET:
15448                     PerlIO_printf(Perl_debug_log, "UNICODE");
15449                     break;
15450                 case REGEX_LOCALE_CHARSET:
15451                     PerlIO_printf(Perl_debug_log, "LOCALE");
15452                     break;
15453                 case REGEX_ASCII_RESTRICTED_CHARSET:
15454                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15455                     break;
15456                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15457                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15458                     break;
15459                 default:
15460                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15461                     break;
15462             }
15463     }
15464     if (lead)  {
15465         if (set)
15466             PerlIO_printf(Perl_debug_log, "\n");
15467         else
15468             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15469     }
15470 }
15471 #endif
15472
15473 void
15474 Perl_regdump(pTHX_ const regexp *r)
15475 {
15476 #ifdef DEBUGGING
15477     dVAR;
15478     SV * const sv = sv_newmortal();
15479     SV *dsv= sv_newmortal();
15480     RXi_GET_DECL(r,ri);
15481     GET_RE_DEBUG_FLAGS_DECL;
15482
15483     PERL_ARGS_ASSERT_REGDUMP;
15484
15485     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15486
15487     /* Header fields of interest. */
15488     if (r->anchored_substr) {
15489         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15490             RE_SV_DUMPLEN(r->anchored_substr), 30);
15491         PerlIO_printf(Perl_debug_log,
15492                       "anchored %s%s at %"IVdf" ",
15493                       s, RE_SV_TAIL(r->anchored_substr),
15494                       (IV)r->anchored_offset);
15495     } else if (r->anchored_utf8) {
15496         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15497             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15498         PerlIO_printf(Perl_debug_log,
15499                       "anchored utf8 %s%s at %"IVdf" ",
15500                       s, RE_SV_TAIL(r->anchored_utf8),
15501                       (IV)r->anchored_offset);
15502     }
15503     if (r->float_substr) {
15504         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15505             RE_SV_DUMPLEN(r->float_substr), 30);
15506         PerlIO_printf(Perl_debug_log,
15507                       "floating %s%s at %"IVdf"..%"UVuf" ",
15508                       s, RE_SV_TAIL(r->float_substr),
15509                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15510     } else if (r->float_utf8) {
15511         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15512             RE_SV_DUMPLEN(r->float_utf8), 30);
15513         PerlIO_printf(Perl_debug_log,
15514                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15515                       s, RE_SV_TAIL(r->float_utf8),
15516                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15517     }
15518     if (r->check_substr || r->check_utf8)
15519         PerlIO_printf(Perl_debug_log,
15520                       (const char *)
15521                       (r->check_substr == r->float_substr
15522                        && r->check_utf8 == r->float_utf8
15523                        ? "(checking floating" : "(checking anchored"));
15524     if (r->intflags & PREGf_NOSCAN)
15525         PerlIO_printf(Perl_debug_log, " noscan");
15526     if (r->extflags & RXf_CHECK_ALL)
15527         PerlIO_printf(Perl_debug_log, " isall");
15528     if (r->check_substr || r->check_utf8)
15529         PerlIO_printf(Perl_debug_log, ") ");
15530
15531     if (ri->regstclass) {
15532         regprop(r, sv, ri->regstclass, NULL);
15533         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15534     }
15535     if (r->intflags & PREGf_ANCH) {
15536         PerlIO_printf(Perl_debug_log, "anchored");
15537         if (r->intflags & PREGf_ANCH_BOL)
15538             PerlIO_printf(Perl_debug_log, "(BOL)");
15539         if (r->intflags & PREGf_ANCH_MBOL)
15540             PerlIO_printf(Perl_debug_log, "(MBOL)");
15541         if (r->intflags & PREGf_ANCH_SBOL)
15542             PerlIO_printf(Perl_debug_log, "(SBOL)");
15543         if (r->intflags & PREGf_ANCH_GPOS)
15544             PerlIO_printf(Perl_debug_log, "(GPOS)");
15545         PerlIO_putc(Perl_debug_log, ' ');
15546     }
15547     if (r->intflags & PREGf_GPOS_SEEN)
15548         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15549     if (r->intflags & PREGf_SKIP)
15550         PerlIO_printf(Perl_debug_log, "plus ");
15551     if (r->intflags & PREGf_IMPLICIT)
15552         PerlIO_printf(Perl_debug_log, "implicit ");
15553     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15554     if (r->extflags & RXf_EVAL_SEEN)
15555         PerlIO_printf(Perl_debug_log, "with eval ");
15556     PerlIO_printf(Perl_debug_log, "\n");
15557     DEBUG_FLAGS_r({
15558         regdump_extflags("r->extflags: ",r->extflags);
15559         regdump_intflags("r->intflags: ",r->intflags);
15560     });
15561 #else
15562     PERL_ARGS_ASSERT_REGDUMP;
15563     PERL_UNUSED_CONTEXT;
15564     PERL_UNUSED_ARG(r);
15565 #endif  /* DEBUGGING */
15566 }
15567
15568 /*
15569 - regprop - printable representation of opcode, with run time support
15570 */
15571
15572 void
15573 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15574 {
15575 #ifdef DEBUGGING
15576     dVAR;
15577     int k;
15578
15579     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15580     static const char * const anyofs[] = {
15581 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15582     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15583     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15584     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15585     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15586     || _CC_VERTSPACE != 16
15587   #error Need to adjust order of anyofs[]
15588 #endif
15589         "\\w",
15590         "\\W",
15591         "\\d",
15592         "\\D",
15593         "[:alpha:]",
15594         "[:^alpha:]",
15595         "[:lower:]",
15596         "[:^lower:]",
15597         "[:upper:]",
15598         "[:^upper:]",
15599         "[:punct:]",
15600         "[:^punct:]",
15601         "[:print:]",
15602         "[:^print:]",
15603         "[:alnum:]",
15604         "[:^alnum:]",
15605         "[:graph:]",
15606         "[:^graph:]",
15607         "[:cased:]",
15608         "[:^cased:]",
15609         "\\s",
15610         "\\S",
15611         "[:blank:]",
15612         "[:^blank:]",
15613         "[:xdigit:]",
15614         "[:^xdigit:]",
15615         "[:space:]",
15616         "[:^space:]",
15617         "[:cntrl:]",
15618         "[:^cntrl:]",
15619         "[:ascii:]",
15620         "[:^ascii:]",
15621         "\\v",
15622         "\\V"
15623     };
15624     RXi_GET_DECL(prog,progi);
15625     GET_RE_DEBUG_FLAGS_DECL;
15626
15627     PERL_ARGS_ASSERT_REGPROP;
15628
15629     sv_setpvs(sv, "");
15630
15631     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15632         /* It would be nice to FAIL() here, but this may be called from
15633            regexec.c, and it would be hard to supply pRExC_state. */
15634         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15635                                               (int)OP(o), (int)REGNODE_MAX);
15636     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15637
15638     k = PL_regkind[OP(o)];
15639
15640     if (k == EXACT) {
15641         sv_catpvs(sv, " ");
15642         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15643          * is a crude hack but it may be the best for now since
15644          * we have no flag "this EXACTish node was UTF-8"
15645          * --jhi */
15646         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15647                   PERL_PV_ESCAPE_UNI_DETECT |
15648                   PERL_PV_ESCAPE_NONASCII   |
15649                   PERL_PV_PRETTY_ELLIPSES   |
15650                   PERL_PV_PRETTY_LTGT       |
15651                   PERL_PV_PRETTY_NOCLEAR
15652                   );
15653     } else if (k == TRIE) {
15654         /* print the details of the trie in dumpuntil instead, as
15655          * progi->data isn't available here */
15656         const char op = OP(o);
15657         const U32 n = ARG(o);
15658         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15659                (reg_ac_data *)progi->data->data[n] :
15660                NULL;
15661         const reg_trie_data * const trie
15662             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15663
15664         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15665         DEBUG_TRIE_COMPILE_r(
15666           Perl_sv_catpvf(aTHX_ sv,
15667             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15668             (UV)trie->startstate,
15669             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15670             (UV)trie->wordcount,
15671             (UV)trie->minlen,
15672             (UV)trie->maxlen,
15673             (UV)TRIE_CHARCOUNT(trie),
15674             (UV)trie->uniquecharcount
15675           );
15676         );
15677         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15678             sv_catpvs(sv, "[");
15679             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15680                                                    ? ANYOF_BITMAP(o)
15681                                                    : TRIE_BITMAP(trie));
15682             sv_catpvs(sv, "]");
15683         }
15684
15685     } else if (k == CURLY) {
15686         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15687             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15688         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15689     }
15690     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15691         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15692     else if (k == REF || k == OPEN || k == CLOSE
15693              || k == GROUPP || OP(o)==ACCEPT)
15694     {
15695         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15696         if ( RXp_PAREN_NAMES(prog) ) {
15697             if ( k != REF || (OP(o) < NREF)) {
15698                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15699                 SV **name= av_fetch(list, ARG(o), 0 );
15700                 if (name)
15701                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15702             }
15703             else {
15704                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15705                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15706                 I32 *nums=(I32*)SvPVX(sv_dat);
15707                 SV **name= av_fetch(list, nums[0], 0 );
15708                 I32 n;
15709                 if (name) {
15710                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15711                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15712                                     (n ? "," : ""), (IV)nums[n]);
15713                     }
15714                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15715                 }
15716             }
15717         }
15718         if ( k == REF && reginfo) {
15719             U32 n = ARG(o);  /* which paren pair */
15720             I32 ln = prog->offs[n].start;
15721             if (prog->lastparen < n || ln == -1)
15722                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15723             else if (ln == prog->offs[n].end)
15724                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15725             else {
15726                 const char *s = reginfo->strbeg + ln;
15727                 Perl_sv_catpvf(aTHX_ sv, ": ");
15728                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15729                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15730             }
15731         }
15732     } else if (k == GOSUB)
15733         /* Paren and offset */
15734         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15735     else if (k == VERB) {
15736         if (!o->flags)
15737             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15738                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15739     } else if (k == LOGICAL)
15740         /* 2: embedded, otherwise 1 */
15741         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15742     else if (k == ANYOF) {
15743         const U8 flags = ANYOF_FLAGS(o);
15744         int do_sep = 0;
15745
15746
15747         if (flags & ANYOF_LOCALE_FLAGS)
15748             sv_catpvs(sv, "{loc}");
15749         if (flags & ANYOF_LOC_FOLD)
15750             sv_catpvs(sv, "{i}");
15751         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15752         if (flags & ANYOF_INVERT)
15753             sv_catpvs(sv, "^");
15754
15755         /* output what the standard cp 0-255 bitmap matches */
15756         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15757
15758         /* output any special charclass tests (used entirely under use
15759          * locale) * */
15760         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15761             int i;
15762             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15763                 if (ANYOF_POSIXL_TEST(o,i)) {
15764                     sv_catpv(sv, anyofs[i]);
15765                     do_sep = 1;
15766                 }
15767             }
15768         }
15769
15770         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15771                       |ANYOF_UTF8
15772                       |ANYOF_NONBITMAP_NON_UTF8
15773                       |ANYOF_LOC_FOLD)))
15774         {
15775             if (do_sep) {
15776                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15777                 if (flags & ANYOF_INVERT)
15778                     /*make sure the invert info is in each */
15779                     sv_catpvs(sv, "^");
15780             }
15781
15782             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15783                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15784             }
15785
15786             /* output information about the unicode matching */
15787             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15788                 sv_catpvs(sv, "{unicode_all}");
15789             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15790                 SV *lv; /* Set if there is something outside the bit map. */
15791                 bool byte_output = FALSE;   /* If something in the bitmap has
15792                                                been output */
15793                 SV *only_utf8_locale;
15794
15795                 /* Get the stuff that wasn't in the bitmap */
15796                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15797                                                     &lv, &only_utf8_locale);
15798                 if (lv && lv != &PL_sv_undef) {
15799                     char *s = savesvpv(lv);
15800                     char * const origs = s;
15801
15802                     while (*s && *s != '\n')
15803                         s++;
15804
15805                     if (*s == '\n') {
15806                         const char * const t = ++s;
15807
15808                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15809                             sv_catpvs(sv, "{outside bitmap}");
15810                         }
15811                         else {
15812                             sv_catpvs(sv, "{utf8}");
15813                         }
15814
15815                         if (byte_output) {
15816                             sv_catpvs(sv, " ");
15817                         }
15818
15819                         while (*s) {
15820                             if (*s == '\n') {
15821
15822                                 /* Truncate very long output */
15823                                 if (s - origs > 256) {
15824                                     Perl_sv_catpvf(aTHX_ sv,
15825                                                 "%.*s...",
15826                                                 (int) (s - origs - 1),
15827                                                 t);
15828                                     goto out_dump;
15829                                 }
15830                                 *s = ' ';
15831                             }
15832                             else if (*s == '\t') {
15833                                 *s = '-';
15834                             }
15835                             s++;
15836                         }
15837                         if (s[-1] == ' ')
15838                             s[-1] = 0;
15839
15840                         sv_catpv(sv, t);
15841                     }
15842
15843                 out_dump:
15844
15845                     Safefree(origs);
15846                     SvREFCNT_dec_NN(lv);
15847                 }
15848
15849                 if ((flags & ANYOF_LOC_FOLD)
15850                      && only_utf8_locale
15851                      && only_utf8_locale != &PL_sv_undef)
15852                 {
15853                     UV start, end;
15854                     int max_entries = 256;
15855
15856                     sv_catpvs(sv, "{utf8 locale}");
15857                     invlist_iterinit(only_utf8_locale);
15858                     while (invlist_iternext(only_utf8_locale,
15859                                             &start, &end)) {
15860                         put_range(sv, start, end);
15861                         max_entries --;
15862                         if (max_entries < 0) {
15863                             sv_catpvs(sv, "...");
15864                             break;
15865                         }
15866                     }
15867                     invlist_iterfinish(only_utf8_locale);
15868                 }
15869             }
15870         }
15871
15872         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15873     }
15874     else if (k == POSIXD || k == NPOSIXD) {
15875         U8 index = FLAGS(o) * 2;
15876         if (index < C_ARRAY_LENGTH(anyofs)) {
15877             if (*anyofs[index] != '[')  {
15878                 sv_catpv(sv, "[");
15879             }
15880             sv_catpv(sv, anyofs[index]);
15881             if (*anyofs[index] != '[')  {
15882                 sv_catpv(sv, "]");
15883             }
15884         }
15885         else {
15886             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15887         }
15888     }
15889     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15890         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15891 #else
15892     PERL_UNUSED_CONTEXT;
15893     PERL_UNUSED_ARG(sv);
15894     PERL_UNUSED_ARG(o);
15895     PERL_UNUSED_ARG(prog);
15896     PERL_UNUSED_ARG(reginfo);
15897 #endif  /* DEBUGGING */
15898 }
15899
15900
15901
15902 SV *
15903 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15904 {                               /* Assume that RE_INTUIT is set */
15905     dVAR;
15906     struct regexp *const prog = ReANY(r);
15907     GET_RE_DEBUG_FLAGS_DECL;
15908
15909     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15910     PERL_UNUSED_CONTEXT;
15911
15912     DEBUG_COMPILE_r(
15913         {
15914             const char * const s = SvPV_nolen_const(prog->check_substr
15915                       ? prog->check_substr : prog->check_utf8);
15916
15917             if (!PL_colorset) reginitcolors();
15918             PerlIO_printf(Perl_debug_log,
15919                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15920                       PL_colors[4],
15921                       prog->check_substr ? "" : "utf8 ",
15922                       PL_colors[5],PL_colors[0],
15923                       s,
15924                       PL_colors[1],
15925                       (strlen(s) > 60 ? "..." : ""));
15926         } );
15927
15928     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15929 }
15930
15931 /*
15932    pregfree()
15933
15934    handles refcounting and freeing the perl core regexp structure. When
15935    it is necessary to actually free the structure the first thing it
15936    does is call the 'free' method of the regexp_engine associated to
15937    the regexp, allowing the handling of the void *pprivate; member
15938    first. (This routine is not overridable by extensions, which is why
15939    the extensions free is called first.)
15940
15941    See regdupe and regdupe_internal if you change anything here.
15942 */
15943 #ifndef PERL_IN_XSUB_RE
15944 void
15945 Perl_pregfree(pTHX_ REGEXP *r)
15946 {
15947     SvREFCNT_dec(r);
15948 }
15949
15950 void
15951 Perl_pregfree2(pTHX_ REGEXP *rx)
15952 {
15953     dVAR;
15954     struct regexp *const r = ReANY(rx);
15955     GET_RE_DEBUG_FLAGS_DECL;
15956
15957     PERL_ARGS_ASSERT_PREGFREE2;
15958
15959     if (r->mother_re) {
15960         ReREFCNT_dec(r->mother_re);
15961     } else {
15962         CALLREGFREE_PVT(rx); /* free the private data */
15963         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15964         Safefree(r->xpv_len_u.xpvlenu_pv);
15965     }
15966     if (r->substrs) {
15967         SvREFCNT_dec(r->anchored_substr);
15968         SvREFCNT_dec(r->anchored_utf8);
15969         SvREFCNT_dec(r->float_substr);
15970         SvREFCNT_dec(r->float_utf8);
15971         Safefree(r->substrs);
15972     }
15973     RX_MATCH_COPY_FREE(rx);
15974 #ifdef PERL_ANY_COW
15975     SvREFCNT_dec(r->saved_copy);
15976 #endif
15977     Safefree(r->offs);
15978     SvREFCNT_dec(r->qr_anoncv);
15979     rx->sv_u.svu_rx = 0;
15980 }
15981
15982 /*  reg_temp_copy()
15983
15984     This is a hacky workaround to the structural issue of match results
15985     being stored in the regexp structure which is in turn stored in
15986     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15987     could be PL_curpm in multiple contexts, and could require multiple
15988     result sets being associated with the pattern simultaneously, such
15989     as when doing a recursive match with (??{$qr})
15990
15991     The solution is to make a lightweight copy of the regexp structure
15992     when a qr// is returned from the code executed by (??{$qr}) this
15993     lightweight copy doesn't actually own any of its data except for
15994     the starp/end and the actual regexp structure itself.
15995
15996 */
15997
15998
15999 REGEXP *
16000 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16001 {
16002     struct regexp *ret;
16003     struct regexp *const r = ReANY(rx);
16004     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16005
16006     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16007
16008     if (!ret_x)
16009         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16010     else {
16011         SvOK_off((SV *)ret_x);
16012         if (islv) {
16013             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16014                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16015                made both spots point to the same regexp body.) */
16016             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16017             assert(!SvPVX(ret_x));
16018             ret_x->sv_u.svu_rx = temp->sv_any;
16019             temp->sv_any = NULL;
16020             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16021             SvREFCNT_dec_NN(temp);
16022             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16023                ing below will not set it. */
16024             SvCUR_set(ret_x, SvCUR(rx));
16025         }
16026     }
16027     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16028        sv_force_normal(sv) is called.  */
16029     SvFAKE_on(ret_x);
16030     ret = ReANY(ret_x);
16031
16032     SvFLAGS(ret_x) |= SvUTF8(rx);
16033     /* We share the same string buffer as the original regexp, on which we
16034        hold a reference count, incremented when mother_re is set below.
16035        The string pointer is copied here, being part of the regexp struct.
16036      */
16037     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16038            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16039     if (r->offs) {
16040         const I32 npar = r->nparens+1;
16041         Newx(ret->offs, npar, regexp_paren_pair);
16042         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16043     }
16044     if (r->substrs) {
16045         Newx(ret->substrs, 1, struct reg_substr_data);
16046         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16047
16048         SvREFCNT_inc_void(ret->anchored_substr);
16049         SvREFCNT_inc_void(ret->anchored_utf8);
16050         SvREFCNT_inc_void(ret->float_substr);
16051         SvREFCNT_inc_void(ret->float_utf8);
16052
16053         /* check_substr and check_utf8, if non-NULL, point to either their
16054            anchored or float namesakes, and don't hold a second reference.  */
16055     }
16056     RX_MATCH_COPIED_off(ret_x);
16057 #ifdef PERL_ANY_COW
16058     ret->saved_copy = NULL;
16059 #endif
16060     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16061     SvREFCNT_inc_void(ret->qr_anoncv);
16062
16063     return ret_x;
16064 }
16065 #endif
16066
16067 /* regfree_internal()
16068
16069    Free the private data in a regexp. This is overloadable by
16070    extensions. Perl takes care of the regexp structure in pregfree(),
16071    this covers the *pprivate pointer which technically perl doesn't
16072    know about, however of course we have to handle the
16073    regexp_internal structure when no extension is in use.
16074
16075    Note this is called before freeing anything in the regexp
16076    structure.
16077  */
16078
16079 void
16080 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16081 {
16082     dVAR;
16083     struct regexp *const r = ReANY(rx);
16084     RXi_GET_DECL(r,ri);
16085     GET_RE_DEBUG_FLAGS_DECL;
16086
16087     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16088
16089     DEBUG_COMPILE_r({
16090         if (!PL_colorset)
16091             reginitcolors();
16092         {
16093             SV *dsv= sv_newmortal();
16094             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16095                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16096             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16097                 PL_colors[4],PL_colors[5],s);
16098         }
16099     });
16100 #ifdef RE_TRACK_PATTERN_OFFSETS
16101     if (ri->u.offsets)
16102         Safefree(ri->u.offsets);             /* 20010421 MJD */
16103 #endif
16104     if (ri->code_blocks) {
16105         int n;
16106         for (n = 0; n < ri->num_code_blocks; n++)
16107             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16108         Safefree(ri->code_blocks);
16109     }
16110
16111     if (ri->data) {
16112         int n = ri->data->count;
16113
16114         while (--n >= 0) {
16115           /* If you add a ->what type here, update the comment in regcomp.h */
16116             switch (ri->data->what[n]) {
16117             case 'a':
16118             case 'r':
16119             case 's':
16120             case 'S':
16121             case 'u':
16122                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16123                 break;
16124             case 'f':
16125                 Safefree(ri->data->data[n]);
16126                 break;
16127             case 'l':
16128             case 'L':
16129                 break;
16130             case 'T':
16131                 { /* Aho Corasick add-on structure for a trie node.
16132                      Used in stclass optimization only */
16133                     U32 refcount;
16134                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16135                     OP_REFCNT_LOCK;
16136                     refcount = --aho->refcount;
16137                     OP_REFCNT_UNLOCK;
16138                     if ( !refcount ) {
16139                         PerlMemShared_free(aho->states);
16140                         PerlMemShared_free(aho->fail);
16141                          /* do this last!!!! */
16142                         PerlMemShared_free(ri->data->data[n]);
16143                         PerlMemShared_free(ri->regstclass);
16144                     }
16145                 }
16146                 break;
16147             case 't':
16148                 {
16149                     /* trie structure. */
16150                     U32 refcount;
16151                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16152                     OP_REFCNT_LOCK;
16153                     refcount = --trie->refcount;
16154                     OP_REFCNT_UNLOCK;
16155                     if ( !refcount ) {
16156                         PerlMemShared_free(trie->charmap);
16157                         PerlMemShared_free(trie->states);
16158                         PerlMemShared_free(trie->trans);
16159                         if (trie->bitmap)
16160                             PerlMemShared_free(trie->bitmap);
16161                         if (trie->jump)
16162                             PerlMemShared_free(trie->jump);
16163                         PerlMemShared_free(trie->wordinfo);
16164                         /* do this last!!!! */
16165                         PerlMemShared_free(ri->data->data[n]);
16166                     }
16167                 }
16168                 break;
16169             default:
16170                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16171                                                     ri->data->what[n]);
16172             }
16173         }
16174         Safefree(ri->data->what);
16175         Safefree(ri->data);
16176     }
16177
16178     Safefree(ri);
16179 }
16180
16181 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16182 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16183 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16184
16185 /*
16186    re_dup - duplicate a regexp.
16187
16188    This routine is expected to clone a given regexp structure. It is only
16189    compiled under USE_ITHREADS.
16190
16191    After all of the core data stored in struct regexp is duplicated
16192    the regexp_engine.dupe method is used to copy any private data
16193    stored in the *pprivate pointer. This allows extensions to handle
16194    any duplication it needs to do.
16195
16196    See pregfree() and regfree_internal() if you change anything here.
16197 */
16198 #if defined(USE_ITHREADS)
16199 #ifndef PERL_IN_XSUB_RE
16200 void
16201 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16202 {
16203     dVAR;
16204     I32 npar;
16205     const struct regexp *r = ReANY(sstr);
16206     struct regexp *ret = ReANY(dstr);
16207
16208     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16209
16210     npar = r->nparens+1;
16211     Newx(ret->offs, npar, regexp_paren_pair);
16212     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16213
16214     if (ret->substrs) {
16215         /* Do it this way to avoid reading from *r after the StructCopy().
16216            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16217            cache, it doesn't matter.  */
16218         const bool anchored = r->check_substr
16219             ? r->check_substr == r->anchored_substr
16220             : r->check_utf8 == r->anchored_utf8;
16221         Newx(ret->substrs, 1, struct reg_substr_data);
16222         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16223
16224         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16225         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16226         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16227         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16228
16229         /* check_substr and check_utf8, if non-NULL, point to either their
16230            anchored or float namesakes, and don't hold a second reference.  */
16231
16232         if (ret->check_substr) {
16233             if (anchored) {
16234                 assert(r->check_utf8 == r->anchored_utf8);
16235                 ret->check_substr = ret->anchored_substr;
16236                 ret->check_utf8 = ret->anchored_utf8;
16237             } else {
16238                 assert(r->check_substr == r->float_substr);
16239                 assert(r->check_utf8 == r->float_utf8);
16240                 ret->check_substr = ret->float_substr;
16241                 ret->check_utf8 = ret->float_utf8;
16242             }
16243         } else if (ret->check_utf8) {
16244             if (anchored) {
16245                 ret->check_utf8 = ret->anchored_utf8;
16246             } else {
16247                 ret->check_utf8 = ret->float_utf8;
16248             }
16249         }
16250     }
16251
16252     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16253     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16254
16255     if (ret->pprivate)
16256         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16257
16258     if (RX_MATCH_COPIED(dstr))
16259         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16260     else
16261         ret->subbeg = NULL;
16262 #ifdef PERL_ANY_COW
16263     ret->saved_copy = NULL;
16264 #endif
16265
16266     /* Whether mother_re be set or no, we need to copy the string.  We
16267        cannot refrain from copying it when the storage points directly to
16268        our mother regexp, because that's
16269                1: a buffer in a different thread
16270                2: something we no longer hold a reference on
16271                so we need to copy it locally.  */
16272     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16273     ret->mother_re   = NULL;
16274 }
16275 #endif /* PERL_IN_XSUB_RE */
16276
16277 /*
16278    regdupe_internal()
16279
16280    This is the internal complement to regdupe() which is used to copy
16281    the structure pointed to by the *pprivate pointer in the regexp.
16282    This is the core version of the extension overridable cloning hook.
16283    The regexp structure being duplicated will be copied by perl prior
16284    to this and will be provided as the regexp *r argument, however
16285    with the /old/ structures pprivate pointer value. Thus this routine
16286    may override any copying normally done by perl.
16287
16288    It returns a pointer to the new regexp_internal structure.
16289 */
16290
16291 void *
16292 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16293 {
16294     dVAR;
16295     struct regexp *const r = ReANY(rx);
16296     regexp_internal *reti;
16297     int len;
16298     RXi_GET_DECL(r,ri);
16299
16300     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16301
16302     len = ProgLen(ri);
16303
16304     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16305           char, regexp_internal);
16306     Copy(ri->program, reti->program, len+1, regnode);
16307
16308     reti->num_code_blocks = ri->num_code_blocks;
16309     if (ri->code_blocks) {
16310         int n;
16311         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16312                 struct reg_code_block);
16313         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16314                 struct reg_code_block);
16315         for (n = 0; n < ri->num_code_blocks; n++)
16316              reti->code_blocks[n].src_regex = (REGEXP*)
16317                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16318     }
16319     else
16320         reti->code_blocks = NULL;
16321
16322     reti->regstclass = NULL;
16323
16324     if (ri->data) {
16325         struct reg_data *d;
16326         const int count = ri->data->count;
16327         int i;
16328
16329         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16330                 char, struct reg_data);
16331         Newx(d->what, count, U8);
16332
16333         d->count = count;
16334         for (i = 0; i < count; i++) {
16335             d->what[i] = ri->data->what[i];
16336             switch (d->what[i]) {
16337                 /* see also regcomp.h and regfree_internal() */
16338             case 'a': /* actually an AV, but the dup function is identical.  */
16339             case 'r':
16340             case 's':
16341             case 'S':
16342             case 'u': /* actually an HV, but the dup function is identical.  */
16343                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16344                 break;
16345             case 'f':
16346                 /* This is cheating. */
16347                 Newx(d->data[i], 1, regnode_ssc);
16348                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16349                 reti->regstclass = (regnode*)d->data[i];
16350                 break;
16351             case 'T':
16352                 /* Trie stclasses are readonly and can thus be shared
16353                  * without duplication. We free the stclass in pregfree
16354                  * when the corresponding reg_ac_data struct is freed.
16355                  */
16356                 reti->regstclass= ri->regstclass;
16357                 /* Fall through */
16358             case 't':
16359                 OP_REFCNT_LOCK;
16360                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16361                 OP_REFCNT_UNLOCK;
16362                 /* Fall through */
16363             case 'l':
16364             case 'L':
16365                 d->data[i] = ri->data->data[i];
16366                 break;
16367             default:
16368                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16369                                                            ri->data->what[i]);
16370             }
16371         }
16372
16373         reti->data = d;
16374     }
16375     else
16376         reti->data = NULL;
16377
16378     reti->name_list_idx = ri->name_list_idx;
16379
16380 #ifdef RE_TRACK_PATTERN_OFFSETS
16381     if (ri->u.offsets) {
16382         Newx(reti->u.offsets, 2*len+1, U32);
16383         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16384     }
16385 #else
16386     SetProgLen(reti,len);
16387 #endif
16388
16389     return (void*)reti;
16390 }
16391
16392 #endif    /* USE_ITHREADS */
16393
16394 #ifndef PERL_IN_XSUB_RE
16395
16396 /*
16397  - regnext - dig the "next" pointer out of a node
16398  */
16399 regnode *
16400 Perl_regnext(pTHX_ regnode *p)
16401 {
16402     dVAR;
16403     I32 offset;
16404
16405     if (!p)
16406         return(NULL);
16407
16408     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16409         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16410                                                 (int)OP(p), (int)REGNODE_MAX);
16411     }
16412
16413     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16414     if (offset == 0)
16415         return(NULL);
16416
16417     return(p+offset);
16418 }
16419 #endif
16420
16421 STATIC void
16422 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16423 {
16424     va_list args;
16425     STRLEN l1 = strlen(pat1);
16426     STRLEN l2 = strlen(pat2);
16427     char buf[512];
16428     SV *msv;
16429     const char *message;
16430
16431     PERL_ARGS_ASSERT_RE_CROAK2;
16432
16433     if (l1 > 510)
16434         l1 = 510;
16435     if (l1 + l2 > 510)
16436         l2 = 510 - l1;
16437     Copy(pat1, buf, l1 , char);
16438     Copy(pat2, buf + l1, l2 , char);
16439     buf[l1 + l2] = '\n';
16440     buf[l1 + l2 + 1] = '\0';
16441     va_start(args, pat2);
16442     msv = vmess(buf, &args);
16443     va_end(args);
16444     message = SvPV_const(msv,l1);
16445     if (l1 > 512)
16446         l1 = 512;
16447     Copy(message, buf, l1 , char);
16448     /* l1-1 to avoid \n */
16449     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16450 }
16451
16452 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16453
16454 #ifndef PERL_IN_XSUB_RE
16455 void
16456 Perl_save_re_context(pTHX)
16457 {
16458     dVAR;
16459
16460     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16461     if (PL_curpm) {
16462         const REGEXP * const rx = PM_GETRE(PL_curpm);
16463         if (rx) {
16464             U32 i;
16465             for (i = 1; i <= RX_NPARENS(rx); i++) {
16466                 char digits[TYPE_CHARS(long)];
16467                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16468                                                "%lu", (long)i);
16469                 GV *const *const gvp
16470                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16471
16472                 if (gvp) {
16473                     GV * const gv = *gvp;
16474                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16475                         save_scalar(gv);
16476                 }
16477             }
16478         }
16479     }
16480 }
16481 #endif
16482
16483 #ifdef DEBUGGING
16484
16485 STATIC void
16486 S_put_byte(pTHX_ SV *sv, int c)
16487 {
16488     PERL_ARGS_ASSERT_PUT_BYTE;
16489
16490     if (!isPRINT(c)) {
16491         switch (c) {
16492             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16493             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16494             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16495             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16496             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16497
16498             default:
16499                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16500                 break;
16501         }
16502     }
16503     else {
16504         const char string = c;
16505         if (c == '-' || c == ']' || c == '\\' || c == '^')
16506             sv_catpvs(sv, "\\");
16507         sv_catpvn(sv, &string, 1);
16508     }
16509 }
16510
16511 STATIC void
16512 S_put_range(pTHX_ SV *sv, UV start, UV end)
16513 {
16514
16515     /* Appends to 'sv' a displayable version of the range of code points from
16516      * 'start' to 'end' */
16517
16518     assert(start <= end);
16519
16520     PERL_ARGS_ASSERT_PUT_RANGE;
16521
16522     if (end - start < 3) {  /* Individual chars in short ranges */
16523         for (; start <= end; start++)
16524             put_byte(sv, start);
16525     }
16526     else if (   end > 255
16527              || ! isALPHANUMERIC(start)
16528              || ! isALPHANUMERIC(end)
16529              || isDIGIT(start) != isDIGIT(end)
16530              || isUPPER(start) != isUPPER(end)
16531              || isLOWER(start) != isLOWER(end)
16532
16533                 /* This final test should get optimized out except on EBCDIC
16534                  * platforms, where it causes ranges that cross discontinuities
16535                  * like i/j to be shown as hex instead of the misleading,
16536                  * e.g. H-K (since that range includes more than H, I, J, K).
16537                  * */
16538              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16539     {
16540         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16541                        start,
16542                        (end < 256) ? end : 255);
16543     }
16544     else { /* Here, the ends of the range are both digits, or both uppercase,
16545               or both lowercase; and there's no discontinuity in the range
16546               (which could happen on EBCDIC platforms) */
16547         put_byte(sv, start);
16548         sv_catpvs(sv, "-");
16549         put_byte(sv, end);
16550     }
16551 }
16552
16553 STATIC bool
16554 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16555 {
16556     /* Appends to 'sv' a displayable version of the innards of the bracketed
16557      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16558      * output anything */
16559
16560     int i;
16561     bool has_output_anything = FALSE;
16562
16563     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16564
16565     for (i = 0; i < 256; i++) {
16566         if (BITMAP_TEST((U8 *) bitmap,i)) {
16567
16568             /* The character at index i should be output.  Find the next
16569              * character that should NOT be output */
16570             int j;
16571             for (j = i + 1; j < 256; j++) {
16572                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16573                     break;
16574                 }
16575             }
16576
16577             /* Everything between them is a single range that should be output
16578              * */
16579             put_range(sv, i, j - 1);
16580             has_output_anything = TRUE;
16581             i = j;
16582         }
16583     }
16584
16585     return has_output_anything;
16586 }
16587
16588 #define CLEAR_OPTSTART \
16589     if (optstart) STMT_START {                                               \
16590         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16591                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16592         optstart=NULL;                                                       \
16593     } STMT_END
16594
16595 #define DUMPUNTIL(b,e)                                                       \
16596                     CLEAR_OPTSTART;                                          \
16597                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16598
16599 STATIC const regnode *
16600 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16601             const regnode *last, const regnode *plast,
16602             SV* sv, I32 indent, U32 depth)
16603 {
16604     dVAR;
16605     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16606     const regnode *next;
16607     const regnode *optstart= NULL;
16608
16609     RXi_GET_DECL(r,ri);
16610     GET_RE_DEBUG_FLAGS_DECL;
16611
16612     PERL_ARGS_ASSERT_DUMPUNTIL;
16613
16614 #ifdef DEBUG_DUMPUNTIL
16615     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16616         last ? last-start : 0,plast ? plast-start : 0);
16617 #endif
16618
16619     if (plast && plast < last)
16620         last= plast;
16621
16622     while (PL_regkind[op] != END && (!last || node < last)) {
16623         /* While that wasn't END last time... */
16624         NODE_ALIGN(node);
16625         op = OP(node);
16626         if (op == CLOSE || op == WHILEM)
16627             indent--;
16628         next = regnext((regnode *)node);
16629
16630         /* Where, what. */
16631         if (OP(node) == OPTIMIZED) {
16632             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16633                 optstart = node;
16634             else
16635                 goto after_print;
16636         } else
16637             CLEAR_OPTSTART;
16638
16639         regprop(r, sv, node, NULL);
16640         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16641                       (int)(2*indent + 1), "", SvPVX_const(sv));
16642
16643         if (OP(node) != OPTIMIZED) {
16644             if (next == NULL)           /* Next ptr. */
16645                 PerlIO_printf(Perl_debug_log, " (0)");
16646             else if (PL_regkind[(U8)op] == BRANCH
16647                      && PL_regkind[OP(next)] != BRANCH )
16648                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16649             else
16650                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16651             (void)PerlIO_putc(Perl_debug_log, '\n');
16652         }
16653
16654       after_print:
16655         if (PL_regkind[(U8)op] == BRANCHJ) {
16656             assert(next);
16657             {
16658                 const regnode *nnode = (OP(next) == LONGJMP
16659                                        ? regnext((regnode *)next)
16660                                        : next);
16661                 if (last && nnode > last)
16662                     nnode = last;
16663                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16664             }
16665         }
16666         else if (PL_regkind[(U8)op] == BRANCH) {
16667             assert(next);
16668             DUMPUNTIL(NEXTOPER(node), next);
16669         }
16670         else if ( PL_regkind[(U8)op]  == TRIE ) {
16671             const regnode *this_trie = node;
16672             const char op = OP(node);
16673             const U32 n = ARG(node);
16674             const reg_ac_data * const ac = op>=AHOCORASICK ?
16675                (reg_ac_data *)ri->data->data[n] :
16676                NULL;
16677             const reg_trie_data * const trie =
16678                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16679 #ifdef DEBUGGING
16680             AV *const trie_words
16681                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16682 #endif
16683             const regnode *nextbranch= NULL;
16684             I32 word_idx;
16685             sv_setpvs(sv, "");
16686             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16687                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16688
16689                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16690                    (int)(2*(indent+3)), "",
16691                     elem_ptr
16692                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16693                                 SvCUR(*elem_ptr), 60,
16694                                 PL_colors[0], PL_colors[1],
16695                                 (SvUTF8(*elem_ptr)
16696                                  ? PERL_PV_ESCAPE_UNI
16697                                  : 0)
16698                                 | PERL_PV_PRETTY_ELLIPSES
16699                                 | PERL_PV_PRETTY_LTGT
16700                             )
16701                     : "???"
16702                 );
16703                 if (trie->jump) {
16704                     U16 dist= trie->jump[word_idx+1];
16705                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16706                                (UV)((dist ? this_trie + dist : next) - start));
16707                     if (dist) {
16708                         if (!nextbranch)
16709                             nextbranch= this_trie + trie->jump[0];
16710                         DUMPUNTIL(this_trie + dist, nextbranch);
16711                     }
16712                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16713                         nextbranch= regnext((regnode *)nextbranch);
16714                 } else {
16715                     PerlIO_printf(Perl_debug_log, "\n");
16716                 }
16717             }
16718             if (last && next > last)
16719                 node= last;
16720             else
16721                 node= next;
16722         }
16723         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16724             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16725                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16726         }
16727         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16728             assert(next);
16729             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16730         }
16731         else if ( op == PLUS || op == STAR) {
16732             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16733         }
16734         else if (PL_regkind[(U8)op] == ANYOF) {
16735             /* arglen 1 + class block */
16736             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16737                           ? ANYOF_POSIXL_SKIP
16738                           : ANYOF_SKIP);
16739             node = NEXTOPER(node);
16740         }
16741         else if (PL_regkind[(U8)op] == EXACT) {
16742             /* Literal string, where present. */
16743             node += NODE_SZ_STR(node) - 1;
16744             node = NEXTOPER(node);
16745         }
16746         else {
16747             node = NEXTOPER(node);
16748             node += regarglen[(U8)op];
16749         }
16750         if (op == CURLYX || op == OPEN)
16751             indent++;
16752     }
16753     CLEAR_OPTSTART;
16754 #ifdef DEBUG_DUMPUNTIL
16755     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16756 #endif
16757     return node;
16758 }
16759
16760 #endif  /* DEBUGGING */
16761
16762 /*
16763  * Local variables:
16764  * c-indentation-style: bsd
16765  * c-basic-offset: 4
16766  * indent-tabs-mode: nil
16767  * End:
16768  *
16769  * ex: set ts=8 sts=4 sw=4 et:
16770  */