src/5021002/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105
 106 struct RExC_state_t {   /* regex-compile state; fields are reached through the RExC_* macros (pRExC_state->field) */
 107     U32         flags;                  /* RXf_* are we folding, multilining? */
 108     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object
 113                                            pprivate field */
 114     char        *start;                 /* Start of input for compile */
 115     char        *end;                   /* End of input for compile */
 116     char        *parse;                 /* Input-scan pointer. */
 117     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 118     regnode     *emit_start;            /* Start of emitted-code area */
 119     regnode     *emit_bound;            /* First regnode outside of the
 120                                            allocated space */
 121     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 122                                            implies compiling, so don't emit */
 123     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 124                                            large enough for the largest
 125                                            non-EXACTish node, so can use it as
 126                                            scratch in pass1 */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     SSize_t     size;                   /* Code size. */
 131     I32                npar;            /* Capture buffer count, (OPEN) plus
 132                                            one. ("par" 0 is the whole
 133                                            pattern)*/
 134     I32         nestroot;               /* root parens we are in - used by
 135                                            accept */
 136     I32         extralen;
 137     I32         seen_zerolen;
 138     regnode     **open_parens;          /* pointers to open parens */
 139     regnode     **close_parens;         /* pointers to close parens */
 140     regnode     *opend;                 /* END node in program */
 141     I32         utf8;           /* whether the pattern is utf8 or not */
 142     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 143                                 /* XXX use this for future optimisation of case
 144                                  * where pattern must be upgraded to utf8. */
 145     I32         uni_semantics;  /* If a d charset modifier should use unicode
 146                                    rules, even if the pattern is not in
 147                                    utf8 */
 148     HV          *paren_names;           /* Paren names */
 149
 150     regnode     **recurse;              /* Recurse regops */
 151     I32         recurse_count;          /* Number of recurse regops */
 152     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153                                            through */
 154     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155     I32         in_lookbehind;
 156     I32         contains_locale;
 157     I32         contains_i;
 158     I32         override_recoding;
 159     I32         in_multi_char_class;
 160     struct reg_code_block *code_blocks; /* positions of literal (?{})
 161                                             within pattern */
 162     int         num_code_blocks;        /* size of code_blocks[] */
 163     int         code_index;             /* next code_blocks[] slot */
 164     SSize_t     maxlen;                        /* NOTE(review): formerly documented as the "minimum possible number of chars in string to match"; the field name suggests the maximum -- confirm */
 165 #ifdef ADD_TO_REGEXEC
 166     char        *starttry;              /* -Dr: where regtry was called. */
 167 #define RExC_starttry   (pRExC_state->starttry)
 168 #endif
 169     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171     const char  *lastparse;
 172     I32         lastnum;
 173     AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse  (pRExC_state->lastparse)
 175 #define RExC_lastnum    (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags      (pRExC_state->flags)
 181 #define RExC_pm_flags   (pRExC_state->pm_flags)
 182 #define RExC_precomp    (pRExC_state->precomp)
 183 #define RExC_rx_sv      (pRExC_state->rx_sv)
 184 #define RExC_rx         (pRExC_state->rx)
 185 #define RExC_rxi        (pRExC_state->rxi)
 186 #define RExC_start      (pRExC_state->start)
 187 #define RExC_end        (pRExC_state->end)
 188 #define RExC_parse      (pRExC_state->parse)
 189 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 192                                                          others */
 193 #endif
 194 #define RExC_emit       (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar       (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen   (pRExC_state->extralen)
 206 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 207 #define RExC_utf8       (pRExC_state->utf8)
 208 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 210 #define RExC_open_parens        (pRExC_state->open_parens)
 211 #define RExC_close_parens       (pRExC_state->close_parens)
 212 #define RExC_opend      (pRExC_state->opend)
 213 #define RExC_paren_names        (pRExC_state->paren_names)
 214 #define RExC_recurse    (pRExC_state->recurse)
 215 #define RExC_recurse_count      (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218                                    (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale    (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225 /* ISMULT1(c): c is '*', '+' or '?'.  ISMULT2(s): the char s points at is one of those, or is '{' and regcurly(s) is true.  ISMULT2 evaluates s more than once, so s must have no side effects. */
 226 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s)      (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
 228         (*(s) == '{' && regcurly(s)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST           0       /* Worst case. */
 234 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE          0x02
 241 #define SPSTART         0x04    /* Starts with * or + */
 242 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calculate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256 /* Bitmap helpers: paren number n is bit (n & 7) of byte (n >> 3) of u8str.  Each expansion is fully parenthesized so it can be used inside a larger expression. */
 257 #define PBYTE(u8str,paren) (((U8*)(u8str))[(paren) >> 3])
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) (PBYTE(u8str,paren) |= PBITVAL(paren))
 261 #define PAREN_UNSET(u8str,paren) (PBYTE(u8str,paren) &= (~PBITVAL(paren)))
 262 /* If !UTF: store RESTART_UTF8 in *flagp and return NULL from the function that uses this macro.  Needs a 'flagp' pointer in scope. */
 263 #define REQUIRE_UTF8    STMT_START {                                       \
 264                                      if (!UTF) {                           \
 265                                          *flagp = RESTART_UTF8;            \
 266                                          return NULL;                      \
 267                                      }                                     \
 268                         } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290     /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298      /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305     This is the position the string must appear at, or not before.
 306     It also implicitly (when combined with minlenp) tells us how many
 307     characters must match before the string we are searching for.
 308     Likewise when combined with minlenp and the length of the string it
 309     tells us how many characters must appear after the string we have
 310     found.
 311
 312   - max_offset
 313     Only used for floating strings. This is the rightmost point that
 314     the string can appear at. If set to SSize_t_MAX it indicates that the
 315     string can occur infinitely far to the right.
 316
 317   - minlenp
 318     A pointer to the minimum number of characters of the pattern that the
 319     string was found inside. This is important as in the case of positive
 320     lookahead or positive lookbehind we can have multiple patterns
 321     involved. Consider
 322
 323     /(?=FOO).*F/
 324
 325     The minimum length of the pattern overall is 3, the minimum length
 326     of the lookahead part is 3, but the minimum length of the part that
 327     will actually match is 1. So 'FOO's minimum length is 3, but the
 328     minimum length for the F is 1. This is important as the minimum length
 329     is used to determine offsets in front of and behind the string being
 330     looked for.  Since strings can be composites this is the length of the
 331     pattern at the time it was committed with a scan_commit. Note that
 332     the length is calculated by study_chunk, so that the minimum lengths
 333     are not known until the full pattern has been compiled, thus the
 334     pointer to the value.
 335
 336   - lookbehind
 337
 338     In the case of lookbehind the string being searched for can be
 339     offset past the start point of the final matching string.
 340     If this value was just blithely removed from the min_offset it would
 341     invalidate some of the calculations for how many chars must match
 342     before or after (as they are derived from min_offset and minlen and
 343     the length of the string being searched for).
 344     When the final pattern is compiled and the data is moved from the
 345     scan_data_t structure into the regexp structure the information
 346     about lookbehind is factored in, with the information that would
 347     have been lost precalculated in the end_shift field for the
 348     associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356     /*I32 len_min;      unused */
 357     /*I32 len_delta;    unused */
 358     SSize_t pos_min;
 359     SSize_t pos_delta;
 360     SV *last_found;
 361     SSize_t last_end;       /* min value, <0 unless valid. */
 362     SSize_t last_start_min;
 363     SSize_t last_start_max;
 364     SV **longest;           /* Either &l_fixed, or &l_float (presumably &longest_fixed / &longest_float below -- TODO confirm). */
 365     SV *longest_fixed;      /* longest fixed string found in pattern */
 366     SSize_t offset_fixed;   /* offset where it starts */
 367     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368     I32 lookbehind_fixed;   /* is the position of the string modified by LB; NOTE(review): I32 here, SSize_t for lookbehind_float */
 369     SV *longest_float;      /* longest floating string found in pattern */
 370     SSize_t offset_float_min; /* earliest point in string it can appear */
 371     SSize_t offset_float_max; /* latest point in string it can appear */
 372     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374     I32 flags;
 375     I32 whilem_c;
 376     SSize_t *last_closep;
 377     regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL          0x0001
 407 #define SF_BEFORE_MEOL          0x0002
 408 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL        (+2)
 412 #define SF_FL_SHIFT_EOL         (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF               0x0040
 420 #define SF_HAS_PAR              0x0080
 421 #define SF_IN_PAR               0x0100
 422 #define SF_HAS_EVAL             0x0200
 423 #define SCF_DO_SUBSTR           0x0400
 424 #define SCF_DO_STCLASS_AND      0x0800
 425 #define SCF_DO_STCLASS_OR       0x1000
 426 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS  0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438                                                      == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441                                                      >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS          -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE             0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b)) /* utf8_distance(a,b) when UTF, else the pointer difference; args parenthesized so expression arguments are safe */
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480 /* Arguments for the two %UTF8f conversions in REPORT_LOCATION: first 'offset' bytes starting at RExC_precomp, then the remaining bytes from RExC_precomp + offset up to RExC_end. */
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482                 UTF8fARG(UTF, offset, RExC_precomp), \
 483                 UTF8fARG(UTF, RExC_end - RExC_precomp - (offset), RExC_precomp + (offset))
 484
 485 /*
 486  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then runs 'code' (FAIL and
 487  * FAIL2 pass a Perl_croak call) with 'len' and 'ellipses' in scope.  Show
 488  * regex, up to a maximum length. If it's too long, chop and add "...".
 489  */
 490 #define _FAIL(code) STMT_START {                                        \
 491     const char *ellipses = "";                                          \
 492     IV len = RExC_end - RExC_precomp;                                   \
 493                                                                         \
 494     if (!SIZE_ONLY)                                                     \
 495         SAVEFREESV(RExC_rx_sv);                                         \
 496     if (len > RegexLengthToShowInErrorMessages) {                       \
 497         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 498         len = RegexLengthToShowInErrorMessages - 10;                    \
 499         ellipses = "...";                                               \
 500     }                                                                   \
 501     code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(                            \
 505     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 506             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(                       \
 509     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 510             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {                                    \
 516     const IV offset = RExC_parse - RExC_precomp;                        \
 517     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 518             m, REPORT_LOCATION_ARGS(offset));   \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {                           \
 525     if (!SIZE_ONLY)                                     \
 526         SAVEFREESV(RExC_rx_sv);                         \
 527     Simple_vFAIL(m);                                    \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 534     const IV offset = RExC_parse - RExC_precomp;                        \
 535     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 536                       REPORT_LOCATION_ARGS(offset));    \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {                       \
 543     if (!SIZE_ONLY)                                     \
 544         SAVEFREESV(RExC_rx_sv);                         \
 545     Simple_vFAIL2(m, a1);                               \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 553     const IV offset = RExC_parse - RExC_precomp;                \
 554     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 555             REPORT_LOCATION_ARGS(offset));      \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {                    \
 562     if (!SIZE_ONLY)                                     \
 563         SAVEFREESV(RExC_rx_sv);                         \
 564     Simple_vFAIL3(m, a1, a2);                           \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 571     const IV offset = RExC_parse - RExC_precomp;                \
 572     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 573             REPORT_LOCATION_ARGS(offset));      \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 577     if (!SIZE_ONLY)                                     \
 578         SAVEFREESV(RExC_rx_sv);                         \
 579     Simple_vFAIL4(m, a1, a2, a3);                       \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584     const IV offset = RExC_parse - RExC_precomp;   \
 585     if (!SIZE_ONLY)                                \
 586         SAVEFREESV(RExC_rx_sv);                    \
 587     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588             REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594     const IV offset = loc - RExC_precomp;                               \
 595     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596             m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {                                   \
 600     const IV offset = loc - RExC_precomp;                               \
 601     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 602             REPORT_LOCATION_ARGS(offset));              \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {                                  \
 606     const IV offset = loc - RExC_precomp;                               \
 607     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 608             REPORT_LOCATION_ARGS(offset));              \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {                                   \
 612     const IV offset = loc - RExC_precomp;                               \
 613     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 614             m REPORT_LOCATION,                                          \
 615             REPORT_LOCATION_ARGS(offset));              \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {                                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 621             m REPORT_LOCATION,                                          \
 622             REPORT_LOCATION_ARGS(offset));              \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 626     const IV offset = loc - RExC_precomp;                               \
 627     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 628             m REPORT_LOCATION,                                          \
 629             a1, REPORT_LOCATION_ARGS(offset));  \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 635             a1, REPORT_LOCATION_ARGS(offset));  \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 639     const IV offset = loc - RExC_precomp;                               \
 640     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 641             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 645     const IV offset = loc - RExC_precomp;                               \
 646     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 647             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 651     const IV offset = loc - RExC_precomp;                               \
 652     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 653             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 657     const IV offset = loc - RExC_precomp;                               \
 658     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 659             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 663     const IV offset = loc - RExC_precomp;                               \
 664     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 665             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {                  \
 671     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 696     if (! SIZE_ONLY) {                                                  \
 697         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 698                     __LINE__, (int)(node), (int)(byte)));               \
 699         if((node) < 0) {                                                \
 700             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701                                          (int)(node));                  \
 702         } else {                                                        \
 703             RExC_offsets[2*(node)-1] = (byte);                          \
 704         }                                                               \
 705     }                                                                   \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 713     if (! SIZE_ONLY) {                                                  \
 714         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 715                 __LINE__, (int)(node), (int)(len)));                    \
 716         if((node) < 0) {                                                \
 717             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718                                          (int)(node));                  \
 719         } else {                                                        \
 720             RExC_offsets[2*(node)] = (len);                             \
 721         }                                                               \
 722     }                                                                   \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728     Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 735     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 736     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 737 } STMT_END
 738 #endif
 739
/* EXPERIMENTAL_INPLACESCAN is defined only when the build sets
 * PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS to a non-zero value; an
 * undefined macro evaluates as 0 in '#if', which leaves it off. */
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#define EXPERIMENTAL_INPLACESCAN
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745         DEBUG_OPTIMISE_MORE_r({                                             \
 746             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                                                                             \
 748             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                                                                             \
 751             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                                                                             \
 754             if (RExC_seen & REG_GPOS_SEEN)                                  \
 755                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                                                                             \
 757             if (RExC_seen & REG_CANY_SEEN)                                  \
 758                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                                                                             \
 760             if (RExC_seen & REG_RECURSE_SEEN)                               \
 761                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                                                                             \
 763             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                                                                             \
 766             if (RExC_seen & REG_VERBARG_SEEN)                               \
 767                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                                                                             \
 769             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                                                                             \
 772             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                                                                             \
 775             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                                                                             \
 778             if (RExC_seen & REG_GOSTART_SEEN)                               \
 779                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                                                                             \
 781             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                                                                             \
 784             PerlIO_printf(Perl_debug_log,"\n");                             \
 785         });
 786
/* Debugging aid.  Inside DEBUG_OPTIMISE_MORE_r(), and only if 'data' is
 * non-NULL, print one line to Perl_debug_log describing '*data':
 *   - indentation of 2*'depth' columns, then 'str', which is pasted into the
 *     format and so has to be a string literal;
 *   - pos_min/pos_delta, flags, whilem_c, and *last_closep (or -1 if that
 *     pointer is NULL);
 *   - "INF " when 'is_inf' is true.  'is_inf' is NOT a macro parameter: a
 *     variable of that name must be in scope wherever the macro is used;
 *   - if data->last_found is set, that string with its end/start positions,
 *     and the fixed and floating strings with their offsets; a "*" marks
 *     whichever of the two data->longest currently points at.
 * 'data' is evaluated many times.  The expansion already ends with a ';'. */
#define DEBUG_STUDYDATA(str,data,depth)                              \
DEBUG_OPTIMISE_MORE_r(if(data){                                      \
    PerlIO_printf(Perl_debug_log,                                    \
        "%*s" str "Pos:%"IVdf"/%"IVdf                                \
        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
        (int)(depth)*2, "",                                          \
        (IV)((data)->pos_min),                                       \
        (IV)((data)->pos_delta),                                     \
        (UV)((data)->flags),                                         \
        (IV)((data)->whilem_c),                                      \
        (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
        is_inf ? "INF " : ""                                         \
    );                                                               \
    if ((data)->last_found)                                          \
        PerlIO_printf(Perl_debug_log,                                \
            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
            " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
            SvPVX_const((data)->last_found),                         \
            (IV)((data)->last_end),                                  \
            (IV)((data)->last_start_min),                            \
            (IV)((data)->last_start_max),                            \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
            SvPVX_const((data)->longest_fixed),                      \
            (IV)((data)->offset_fixed),                              \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_float)) ? "*" : "",  \
            SvPVX_const((data)->longest_float),                      \
            (IV)((data)->offset_float_min),                          \
            (IV)((data)->offset_float_max)                           \
        );                                                           \
    PerlIO_printf(Perl_debug_log,"\n");                              \
});
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827                     SSize_t *minlenp, int is_inf)
 828 {
 829     const STRLEN l = CHR_SVLEN(data->last_found);
 830     const STRLEN old_l = CHR_SVLEN(*data->longest);
 831     GET_RE_DEBUG_FLAGS_DECL;
 832
 833     PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836         SvSetMagicSV(*data->longest, data->last_found);
 837         if (*data->longest == data->longest_fixed) {
 838             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839             if (data->flags & SF_BEFORE_EOL)
 840                 data->flags
 841                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842             else
 843                 data->flags &= ~SF_FIX_BEFORE_EOL;
 844             data->minlen_fixed=minlenp;
 845             data->lookbehind_fixed=0;
 846         }
 847         else { /* *data->longest == data->longest_float */
 848             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849             data->offset_float_max = (l
 850                                       ? data->last_start_max
 851                                       : (data->pos_delta == SSize_t_MAX
 852                                          ? SSize_t_MAX
 853                                          : data->pos_min + data->pos_delta));
 854             if (is_inf
 855                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856                 data->offset_float_max = SSize_t_MAX;
 857             if (data->flags & SF_BEFORE_EOL)
 858                 data->flags
 859                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860             else
 861                 data->flags &= ~SF_FL_BEFORE_EOL;
 862             data->minlen_float=minlenp;
 863             data->lookbehind_float=0;
 864         }
 865     }
 866     SvCUR_set(data->last_found, 0);
 867     {
 868         SV * const sv = data->last_found;
 869         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871             if (mg)
 872                 mg->mg_len = 0;
 873         }
 874     }
 875     data->last_end = -1;
 876     data->flags &= ~SF_BEFORE_EOL;
 877     DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886     /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888     PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890     assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(const regnode_ssc *ssc)
 899 {
 900     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903      * in any way, so there's no point in using it */
 904
 905     UV start, end;
 906     bool ret;
 907
 908     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910     assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913         return FALSE;
 914     }
 915
 916     /* See if the list consists solely of the range 0 - Infinity */
 917     invlist_iterinit(ssc->invlist);
 918     ret = invlist_iternext(ssc->invlist, &start, &end)
 919           && start == 0
 920           && end == UV_MAX;
 921
 922     invlist_iterfinish(ssc->invlist);
 923
 924     if (ret) {
 925         return TRUE;
 926     }
 927
 928     /* If e.g., both \w and \W are set, matches everything */
 929     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930         int i;
 931         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933                 return TRUE;
 934             }
 935         }
 936     }
 937
 938     return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945      * string, any code point, or any posix class under locale */
 946
 947     PERL_ARGS_ASSERT_SSC_INIT;
 948
 949     Zero(ssc, 1, regnode_ssc);
 950     set_ANYOF_SYNTHETIC(ssc);
 951     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952     ssc_anything(ssc);
 953
 954     /* If any portion of the regex is to operate under locale rules,
 955      * initialization includes it.  The reason this isn't done for all regexes
 956      * is that the optimizer was written under the assumption that locale was
 957      * all-or-nothing.  Given the complexity and lack of documentation in the
 958      * optimizer, and that there are inadequate test cases for locale, many
 959      * parts of it may not work properly, it is safest to avoid locale unless
 960      * necessary. */
 961     if (RExC_contains_locale) {
 962         ANYOF_POSIXL_SETALL(ssc);
 963     }
 964     else {
 965         ANYOF_POSIXL_ZERO(ssc);
 966     }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 971                         const regnode_ssc *ssc)
 972 {
 973     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974      * to the list of code points matched, and locale posix classes; hence does
 975      * not check its flags) */
 976
 977     UV start, end;
 978     bool ret;
 979
 980     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982     assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984     invlist_iterinit(ssc->invlist);
 985     ret = invlist_iternext(ssc->invlist, &start, &end)
 986           && start == 0
 987           && end == UV_MAX;
 988
 989     invlist_iterfinish(ssc->invlist);
 990
 991     if (! ret) {
 992         return FALSE;
 993     }
 994
 995     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996         return FALSE;
 997     }
 998
 999     return TRUE;
1000 }
1001
STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
                               const regnode_charclass* const node)
{
    /* Returns a mortal inversion list defining which code points are matched
     * by 'node', which is of type ANYOF.  Handles complementing the result if
     * appropriate.  If some code points aren't knowable at this time, the
     * returned list must, and will, contain every code point that is a
     * possibility.
     *
     * The steps below are order dependent (see the individual comments): the
     * pre-inversion trimming must precede adding the bitmap, and the
     * UTF-8-locale-only list must be merged last. */

    SV* invlist = sv_2mortal(_new_invlist(0));
    SV* only_utf8_locale_invlist = NULL;
    unsigned int i;

    /* Index of this node's auxiliary data in RExC_rxi->data, or
     * ANYOF_NONBITMAP_EMPTY if it has none */
    const U32 n = ARG(node);

    /* Set once any bit in the node's 0-255 bitmap is found to be on */
    bool new_node_has_latin1 = FALSE;

    PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;

    /* Look at the data structure created by S_set_ANYOF_arg() */
    if (n != ANYOF_NONBITMAP_EMPTY) {
        SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
        AV * const av = MUTABLE_AV(SvRV(rv));
        SV **const ary = AvARRAY(av);
        assert(RExC_rxi->data->what[n] == 's');

        /* Elements of 'ary' used here: [1] compile-time swash; [0] things
         * known only at run time; [3] the node's inversion list; [2] code
         * points valid only under UTF-8 locales */
        if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
            invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
        }
        else if (ary[0] && ary[0] != &PL_sv_undef) {

            /* Here, no compile-time swash, and there are things that won't be
             * known until runtime -- we have to assume it could be anything */
            return _add_range_to_invlist(invlist, 0, UV_MAX);
        }
        else if (ary[3] && ary[3] != &PL_sv_undef) {

            /* Here no compile-time swash, and no run-time only data.  Use the
             * node's inversion list */
            invlist = sv_2mortal(invlist_clone(ary[3]));
        }

        /* Get the code points valid only under UTF-8 locales */
        if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
            && ary[2] && ary[2] != &PL_sv_undef)
        {
            only_utf8_locale_invlist = ary[2];
        }
    }

    /* An ANYOF node contains a bitmap for the first 256 code points, and an
     * inversion list for the others, but if there are code points that should
     * match only conditionally on the target string being UTF-8, those are
     * placed in the inversion list, and not the bitmap.  Since there are
     * circumstances under which they could match, they are included in the
     * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
     * here, so that when we invert below, the end result actually does include
     * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
     * before we add the unconditionally matched code points */
    if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
        _invlist_intersection_complement_2nd(invlist,
                                             PL_UpperLatin1,
                                             &invlist);
    }

    /* Add in the points from the bit map */
    for (i = 0; i < 256; i++) {
        if (ANYOF_BITMAP_TEST(node, i)) {
            invlist = add_cp_to_invlist(invlist, i);
            new_node_has_latin1 = TRUE;
        }
    }

    /* If this can match all upper Latin1 code points, have to add them
     * as well */
    if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
        _invlist_union(invlist, PL_UpperLatin1, &invlist);
    }

    /* Similarly for these */
    if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
        invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
    }

    /* Everything unconditionally matched has now been gathered, so the
     * complement, if called for, can be taken */
    if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
        _invlist_invert(invlist);
    }
    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {

        /* Under /li, any 0-255 could fold to any other 0-255, depending on the
         * locale.  We can skip this if there are no 0-255 at all. */
        _invlist_union(invlist, PL_Latin1, &invlist);
    }

    /* Similarly add the UTF-8 locale possible matches.  These have to be
     * deferred until after the non-UTF-8 locale ones are taken care of just
     * above, or it leads to wrong results under ANYOF_INVERT */
    if (only_utf8_locale_invlist) {
        _invlist_union_maybe_complement_2nd(invlist,
                                            only_utf8_locale_invlist,
                                            ANYOF_FLAGS(node) & ANYOF_INVERT,
                                            &invlist);
    }

    return invlist;
}
1107
/* ssc_init_zero() is, for now, nothing more than another name for
 * ssc_init(): the two do exactly the same thing */
#define ssc_init_zero           ssc_init

/* Conveniences layered on ssc_add_range(): add just the code point 'cp' to
 * the SSC 'ssc', or add every code point (0 through UV_MAX) to it */
#define ssc_add_cp(ssc, cp)     ssc_add_range((ssc), (cp), (cp))
#define ssc_match_all_cp(ssc)   ssc_add_range((ssc), 0, UV_MAX)
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
/* 'AND' a given class with another one.  Can create false positives.  'ssc'
 * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */

STATIC void
S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
                const regnode_charclass *and_with)
{
    /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
     * another SSC or a regular ANYOF class.  Can create false positives.
     *
     * 'ssc' is modified in place: its flags are AND'ed with those taken from
     * 'and_with', and its code point list and posix classes are combined as
     * described in the long comment below.  'and_with' is only read. */

    SV* anded_cp_list;  /* Code points matched by 'and_with' */
    U8  anded_flags;    /* Flags of 'and_with' that take part in the AND */

    PERL_ARGS_ASSERT_SSC_AND;

    assert(is_ANYOF_SYNTHETIC(ssc));

    /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
     * the code point inversion list and just the relevant flags */
    if (is_ANYOF_SYNTHETIC(and_with)) {
        anded_cp_list = ((regnode_ssc *)and_with)->invlist;
        anded_flags = ANYOF_FLAGS(and_with);

        /* XXX This is a kludge around what appears to be deficiencies in the
         * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
         * there are paths through the optimizer where it doesn't get weeded
         * out when it should.  And if we don't make some extra provision for
         * it like the code just below, it doesn't get added when it should.
         * This solution is to add it only when AND'ing, which is here, and
         * only when what is being AND'ed is the pristine, original node
         * matching anything.  Thus it is like adding it to ssc_anything() but
         * only when the result is to be AND'ed.  Probably the same solution
         * could be adopted for the same problem we have with /l matching,
         * which is solved differently in S_ssc_init(), and that would lead to
         * fewer false positives than that solution has.  But if this solution
         * creates bugs, the consequences are only that a warning isn't raised
         * that should be; while the consequences for having /l bugs is
         * incorrect matches */
        if (ssc_is_anything((regnode_ssc *)and_with)) {
            anded_flags |= ANYOF_WARN_SUPER;
        }
    }
    else {
        anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
        anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
    }

    ANYOF_FLAGS(ssc) &= anded_flags;

    /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
     * C2 is the list of code points in 'and-with'; P2, its posix classes.
     * 'and_with' may be inverted.  When not inverted, we have the situation of
     * computing:
     *  (C1 | P1) & (C2 | P2)
     *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
     *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
     *                    <=  ((C1 & C2) |       P2 ) | ( P1       | (P1 & P2))
     *                    <=  ((C1 & C2) | P1 | P2)
     * Alternatively, the last few steps could be:
     *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
     *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
     *                    <=  (C1 | C2 | (P1 & P2))
     * We favor the second approach if either P1 or P2 is non-empty.  This is
     * because these components are a barrier to doing optimizations, as what
     * they match cannot be known until the moment of matching as they are
     * dependent on the current locale, and 'AND'ing them likely will reduce
     * or eliminate them.
     * But we can do better if we know that C1,P1 are in their initial state (a
     * frequent occurrence), each matching everything:
     *  (<everything>) & (C2 | P2) =  C2 | P2
     * Similarly, if C2,P2 are in their initial state (again a frequent
     * occurrence), the result is a no-op
     *  (C1 | P1) & (<everything>) =  C1 | P1
     *
     * Inverted, we have
     *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
     *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
     *                         <=  (C1 & ~C2) | (P1 & ~P2)
     * */

    if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
        && ! is_ANYOF_SYNTHETIC(and_with))
    {
        unsigned int i;

        ssc_intersection(ssc,
                         anded_cp_list,
                         FALSE /* Has already been inverted */
                         );

        /* If either P1 or P2 is empty, the intersection will be also; can skip
         * the loop.
         * NOTE(review): by the derivation above the posix term here is
         * P1 & ~P2, and with P2 empty ~P2 is everything, which would leave P1
         * rather than nothing.  Confirm that clearing P1 in this case cannot
         * produce a false negative. */
        if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
            ANYOF_POSIXL_ZERO(ssc);
        }
        else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {

            /* Note that the Posix class component P from 'and_with' actually
             * looks like:
             *      P = Pa | Pb | ... | Pn
             * where each component is one posix class, such as in [\w\s].
             * Thus
             *      ~P = ~(Pa | Pb | ... | Pn)
             *         = ~Pa & ~Pb & ... & ~Pn
             *        <= ~Pa | ~Pb | ... | ~Pn
             * The last is something we can easily calculate, but unfortunately
             * is likely to have many false positives.  We could do better
             * in some (but certainly not all) instances if two classes in
             * P have known relationships.  For example
             *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
             * So
             *      :lower: & :print: = :lower:
             * And similarly for classes that must be disjoint.  For example,
             * since \s and \w can have no elements in common based on rules in
             * the POSIX standard,
             *      \w & ^\S = nothing
             * Unfortunately, some vendor locales do not meet the Posix
             * standard, in particular almost everything by Microsoft.
             * The loop below just changes e.g., \w into \W and vice versa */

            regnode_charclass_posixl temp;
            int add = 1;    /* To calculate the index of the complement */

            ANYOF_POSIXL_ZERO(&temp);
            for (i = 0; i < ANYOF_MAX; i++) {

                /* A class and its complement are never both expected to be
                 * set in 'and_with' (checked at each even index) */
                assert(i % 2 != 0
                       || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
                       || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));

                /* 'add' alternates +1/-1, so an even index i maps to i+1 and
                 * an odd index to i-1: each class maps to its partner */
                if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
                    ANYOF_POSIXL_SET(&temp, i + add);
                }
                add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
            }
            ANYOF_POSIXL_AND(&temp, ssc);

        } /* else ssc already has no posixes */
    } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
         in its initial state */
    else if (! is_ANYOF_SYNTHETIC(and_with)
             || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
    {
        /* But if 'ssc' is in its initial state, the result is just 'and_with';
         * copy it over 'ssc' */
        if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
            if (is_ANYOF_SYNTHETIC(and_with)) {
                StructCopy(and_with, ssc, regnode_ssc);
            }
            else {
                ssc->invlist = anded_cp_list;
                ANYOF_POSIXL_ZERO(ssc);
                if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
                    ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                }
            }
        }
        else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
                 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
        {
            /* One or the other of P1, P2 is non-empty.  This is the second
             * approach from the comment above: C1 | C2 | (P1 & P2) */
            if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
                ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
            }
            ssc_union(ssc, anded_cp_list, FALSE);
        }
        else { /* P1 = P2 = empty */
            ssc_intersection(ssc, anded_cp_list, FALSE);
        }
    }
}
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288                const regnode_charclass *or_with)
1289 {
1290     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291      * another SSC or a regular ANYOF class.  Can create false positives if
1292      * 'or_with' is to be inverted. */
         /* 'ssc' is updated in place (its flags, its posix classes and its
          * code point list); 'or_with' is only read.  'pRExC_state' is passed
          * on to get_ANYOF_cp_list_for_ssc() when 'or_with' is not an SSC. */
1293
1294     SV* ored_cp_list;
1295     U8 ored_flags;
1296
1297     PERL_ARGS_ASSERT_SSC_OR;
1298
1299     assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302      * the code point inversion list and just the relevant flags */
1303     if (is_ANYOF_SYNTHETIC(or_with)) {
1304         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305         ored_flags = ANYOF_FLAGS(or_with);
1306     }
1307     else {
1308         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310     }
1311
         /* Flags only accumulate here: every flag selected above is turned on
          * in 'ssc', none is turned off */
1312     ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316      * 'or_with' may be inverted.  When not inverted, we have the simple
1317      * situation of computing:
1318      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319      * If P1|P2 yields a situation with both a class and its complement are
1320      * set, like having both \w and \W, this matches all code points, and we
1321      * can delete these from the P component of the ssc going forward.  XXX We
1322      * might be able to delete all the P components, but I (khw) am not certain
1323      * about this, and it is better to be safe.
1324      *
1325      * Inverted, we have
1326      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327      *                         <=  (C1 | P1) | ~C2
1328      *                         <=  (C1 | ~C2) | P1
1329      * (which results in actually simpler code than the non-inverted case)
1330      * */
1331
         /* The inverted case applies only to a regular ANYOF node; an SSC
          * 'or_with' always takes the not-inverted path below */
1332     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333         && ! is_ANYOF_SYNTHETIC(or_with))
1334     {
1335         /* We ignore P2, leaving P1 going forward */
1336     }   /* else  Not inverted */
1337     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
             /* Presumably this merges the posix classes of 'or_with' into
              * those of 'ssc' (the P1 | P2 of the comment above) */
1338         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340             unsigned int i;
                 /* Examine the classes in adjacent pairs (i, i+1).  When both
                  * members of a pair are set, the SSC is widened via
                  * ssc_match_all_cp() and the pair is cleared, as the large
                  * comment above describes for a class and its complement.
                  * NOTE(review): assumes class i+1 is the complement of
                  * even-numbered class i -- confirm against the class
                  * numbering. */
1341             for (i = 0; i < ANYOF_MAX; i += 2) {
1342                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343                 {
1344                     ssc_match_all_cp(ssc);
1345                     ANYOF_POSIXL_CLEAR(ssc, i);
1346                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1347                 }
1348             }
1349         }
1350     }
1351
         /* In every case the code point list chosen at the top is OR'ed into
          * the list of 'ssc' */
1352     ssc_union(ssc,
1353               ored_cp_list,
1354               FALSE /* Already has been inverted */
1355               );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
         /* Combine the code point list of SSC 'ssc' with 'invlist' by handing
          * both, plus 'invert2nd', to _invlist_union_maybe_complement_2nd().
          * The address of ssc->invlist is passed as the output argument, so
          * the result replaces the list held by 'ssc'.  Going by the helper's
          * name, 'invert2nd' asks for 'invlist' to be complemented before the
          * union is taken -- confirm against the helper. */
1361     PERL_ARGS_ASSERT_SSC_UNION;
1362
1363     assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365     _invlist_union_maybe_complement_2nd(ssc->invlist,
1366                                         invlist,
1367                                         invert2nd,
1368                                         &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373                          SV* const invlist,
1374                          const bool invert2nd)
1375 {
         /* Combine the code point list of SSC 'ssc' with 'invlist' by handing
          * both, plus 'invert2nd', to
          * _invlist_intersection_maybe_complement_2nd().  The address of
          * ssc->invlist is passed as the output argument, so the result
          * replaces the list held by 'ssc'.  Going by the helper's name,
          * 'invert2nd' asks for 'invlist' to be complemented before the
          * intersection is taken -- confirm against the helper. */
1376     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378     assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381                                                invlist,
1382                                                invert2nd,
1383                                                &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
         /* Add the code points from 'start' to 'end' to the code point list
          * of SSC 'ssc'.  The list returned by _add_range_to_invlist() is
          * stored back into ssc->invlist.  NOTE(review): 'end' is presumably
          * inclusive -- confirm against _add_range_to_invlist(). */
1389     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391     assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399     /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
         /* Temporary list that will hold only 'cp'.  NOTE(review): the
          * argument 2 is presumably an initial size hint -- confirm. */
1401     SV* cp_list = _new_invlist(2);
1402
1403     PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405     assert(is_ANYOF_SYNTHETIC(ssc));
1406
         /* add_cp_to_invlist() returns the list to use from here on, so the
          * local is reassigned */
1407     cp_list = add_cp_to_invlist(cp_list, cp);
1408     ssc_intersection(ssc, cp_list,
1409                      FALSE /* Not inverted */
1410                      );
         /* The temporary list is no longer needed once the intersection has
          * been stored in 'ssc'; drop this function's reference to it */
1411     SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1416 {
1417     /* Set the SSC 'ssc' to not match any locale things */
         /* Two pieces of state are reset: the posix classes are zeroed, and
          * the bits in ANYOF_LOCALE_FLAGS are turned off in the node's flags.
          * The code point list is not touched. */
1418     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419
1420     assert(is_ANYOF_SYNTHETIC(ssc));
1421
1422     ANYOF_POSIXL_ZERO(ssc);
1423     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1424 }
1425
1426 STATIC void
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 {
1429     /* The inversion list in the SSC is marked mortal; now we need a more
1430      * permanent copy, which is stored the same way that is done in a regular
1431      * ANYOF node, with the first 256 code points in a bit map */
1432
         /* Work on a clone; ssc->invlist itself is only read here and is
          * set to NULL further down */
1433     SV* invlist = invlist_clone(ssc->invlist);
1434
1435     PERL_ARGS_ASSERT_SSC_FINALIZE;
1436
1437     assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439     /* The code in this file assumes that all but these flags aren't relevant
1440      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441      * time we reach here */
1442     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443
         /* 'invlist' is passed by address, so the callee is able to change
          * it.  NOTE(review): confirm what populate_ANYOF_from_invlist()
          * leaves in 'invlist' before it is handed to set_ANYOF_arg(). */
1444     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445
1446     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447                                 NULL, NULL, NULL, FALSE);
1448
1449     /* Make sure is clone-safe */
1450     ssc->invlist = NULL;
1451
         /* Record in the node's flags whether any posix class is set */
1452     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1454     }
1455
         /* Sanity check: locale flags may be present only if the pattern was
          * recorded as containing locale */
1456     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1457 }
1458
     /* Accessors for the per-state transition list used while a trie is being
      * built.  All of them expand to expressions on a variable named 'trie',
      * which must be in scope at the point of use.
      *
      * Element 0 of a state's list is not a transition but a header whose
      * fields are reused: 'forid' holds the index of the next free slot
      * (TRIE_LIST_CUR) and 'newstate' holds the allocated length
      * (TRIE_LIST_LEN).
      *
      * TRIE_LIST_USED(idx) yields the number of transitions stored for state
      * 'idx', or 0 when that state has no list.  The existence test uses the
      * macro's own argument, so the result does not depend on a variable that
      * happens to be called 'state' at the call site. */
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list           \
1463                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1464                                : 0 )
1465
1466
1467 #ifdef DEBUGGING
1468 /*
1469    dump_trie(trie,widecharmap,revcharmap)
1470    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472
1473    These routines dump out a trie in a somewhat readable format.
1474    The _interim_ variants are used for debugging the interim
1475    tables that are used to generate the final compressed
1476    representation which is what dump_trie expects.
1477
1478    Part of the reason for their existence is to provide a form
1479    of documentation as to how the different representations function.
1480
1481 */
1482
1483 /*
1484   Dumps the final compressed table form of the trie to Perl_debug_log.
1485   Used for debugging make_trie().
1486 */
1487
1488 STATIC void
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490             AV *revcharmap, U32 depth)
1491 {
         /* Print the compressed trie 'trie' to Perl_debug_log.
          *
          * trie         the trie to print; only read
          * widecharmap  only tested for being non-NULL, which selects a
          *              column width of 6 instead of 4
          * revcharmap   array from which the text of each character id is
          *              fetched for the header row
          * depth        nesting level; each level indents by two columns
          */
1492     U32 state;
1493     SV *sv=sv_newmortal();
1494     int colwidth= widecharmap ? 6 : 4;
1495     U16 word;
1496     GET_RE_DEBUG_FLAGS_DECL;
1497
1498     PERL_ARGS_ASSERT_DUMP_TRIE;
1499
         /* Header row: fixed column titles, then one column per character */
1500     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501         (int)depth * 2 + 2,"",
1502         "Match","Base","Ofs" );
1503
         /* NB: in this loop and the separator loop below, 'state' is reused
          * as a character id counter, not a state number */
1504     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505         SV ** const tmp = av_fetch( revcharmap, state, 0);
1506         if ( tmp ) {
1507             PerlIO_printf( Perl_debug_log, "%*s",
1508                 colwidth,
1509                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510                             PL_colors[0], PL_colors[1],
1511                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512                             PERL_PV_ESCAPE_FIRSTCHAR
1513                 )
1514             );
1515         }
1516     }
         /* Separator row, widened by 'colwidth' dashes per character */
1517     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518         (int)depth * 2 + 2,"");
1519
1520     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522     PerlIO_printf( Perl_debug_log, "\n");
1523
         /* One row per state, starting at state 1 */
1524     for( state = 1 ; state < trie->statecount ; state++ ) {
1525         const U32 base = trie->states[ state ].trans.base;
1526
1527         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528                                        (int)depth * 2 + 2,"", (UV)state);
1529
             /* "Match" column: the word number if this state has one,
              * otherwise blanks of the same width */
1530         if ( trie->states[ state ].wordnum ) {
1531             PerlIO_printf( Perl_debug_log, " W%4X",
1532                                            trie->states[ state ].wordnum );
1533         } else {
1534             PerlIO_printf( Perl_debug_log, "%6s", "" );
1535         }
1536
1537         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1538
             /* A base of 0 means nothing more is printed for this state */
1539         if ( base ) {
1540             U32 ofs = 0;
1541
                 /* Advance 'ofs' while the slot it designates either lies
                  * before the start of the table (base + ofs is less than
                  * uniquecharcount) or is inside the table (below lasttrans)
                  * but has a 'check' value different from this state.  The
                  * loop thus stops at the first slot whose 'check' is this
                  * state, or on reaching lasttrans. */
1542             while( ( base + ofs  < trie->uniquecharcount ) ||
1543                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1545                                                                     != state))
1546                     ofs++;
1547
1548             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549
                 /* One cell per character id: the 'next' value when the slot
                  * for that character is in range and its 'check' is this
                  * state, otherwise a dot */
1550             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551                 if ( ( base + ofs >= trie->uniquecharcount )
1552                         && ( base + ofs - trie->uniquecharcount
1553                                                         < trie->lasttrans )
1554                         && trie->trans[ base + ofs
1555                                     - trie->uniquecharcount ].check == state )
1556                 {
1557                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558                     colwidth,
1559                     (UV)trie->trans[ base + ofs
1560                                              - trie->uniquecharcount ].next );
1561                 } else {
1562                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1563                 }
1564             }
1565
1566             PerlIO_printf( Perl_debug_log, "]");
1567
1568         }
1569         PerlIO_printf( Perl_debug_log, "\n" );
1570     }
         /* Finally, for every word number from 1 to wordcount, print the
          * 'prev' and 'len' fields of its wordinfo entry.
          * NOTE(review): 'word' is a U16 while the loop bound is
          * trie->wordcount, whose type is not visible here; if wordcount can
          * reach 0xFFFF the condition could never become false -- confirm. */
1571     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572                                 (int)depth*2, "");
1573     for (word=1; word <= trie->wordcount; word++) {
1574         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575             (int)word, (int)(trie->wordinfo[word].prev),
1576             (int)(trie->wordinfo[word].len));
1577     }
1578     PerlIO_printf(Perl_debug_log, "\n" );
1579 }
1580 /*
1581   Dumps a fully constructed but uncompressed trie in list form.
1582   List tries normally only are used for construction when the number of
1583   possible chars (trie->uniquecharcount) is very high.
1584   Used for debugging make_trie().
1585 */
1586 STATIC void
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1589                          U32 depth)
1590 {
         /* Print to Perl_debug_log the transition list of every state of a
          * trie that is still in list form.
          *
          * trie         the trie to print; only read
          * widecharmap  only tested for being non-NULL, which selects a
          *              column width of 6 instead of 4
          * revcharmap   array from which the text of each character id is
          *              fetched
          * next_alloc   one past the last state to print; states 1 up to
          *              next_alloc - 1 are shown
          * depth        nesting level; each level indents by two columns
          */
1591     U32 state;
1592     SV *sv=sv_newmortal();
1593     int colwidth= widecharmap ? 6 : 4;
1594     GET_RE_DEBUG_FLAGS_DECL;
1595
1596     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597
1598     /* print out the table precompression.  */
1599     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601         "------:-----+-----------------\n" );
1602
1603     for( state=1 ; state < next_alloc ; state ++ ) {
1604         U16 charid;
1605
1606         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607             (int)depth * 2 + 2,"", (UV)state  );
             /* "Word" column: blank unless this state has a word number */
1608         if ( ! trie->states[ state ].wordnum ) {
1609             PerlIO_printf( Perl_debug_log, "%5s| ","");
1610         } else {
1611             PerlIO_printf( Perl_debug_log, "W%4x| ",
1612                 trie->states[ state ].wordnum
1613             );
1614         }
             /* Walk the list entries of this state, starting at index 1.  An
              * entry whose character cannot be fetched from 'revcharmap' is
              * silently skipped. */
1615         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616             SV ** const tmp = av_fetch( revcharmap,
1617                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1618             if ( tmp ) {
                     /* Printed as: character text, its id, the target state */
1619                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620                     colwidth,
1621                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622                               colwidth,
1623                               PL_colors[0], PL_colors[1],
1624                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625                               | PERL_PV_ESCAPE_FIRSTCHAR
1626                     ) ,
1627                     TRIE_LIST_ITEM(state,charid).forid,
1628                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1629                 );
                     /* Start a continuation line after every tenth list index
                      * (only reached for entries that were printed) */
1630                 if (!(charid % 10))
1631                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632                         (int)((depth * 2) + 14), "");
1633             }
1634         }
1635         PerlIO_printf( Perl_debug_log, "\n");
1636     }
1637 }
1638
1639 /*
1640   Dumps a fully constructed but uncompressed trie in table form.
1641   This is the normal DFA style state transition table, with a few
1642   twists to facilitate compression later.
1643   Used for debugging make_trie().
1644 */
1645 STATIC void
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1648                           U32 depth)
1649 {
         /* Print to Perl_debug_log a trie that is in uncompressed table form.
          *
          * trie         the trie to print; only read
          * widecharmap  only tested for being non-NULL, which selects a
          *              column width of 6 instead of 4
          * revcharmap   array from which the text of each character id is
          *              fetched for the header row
          * next_alloc   upper bound (exclusive) on the trie->trans index at
          *              which a row may start
          * depth        nesting level; each level indents by two columns
          */
1650     U32 state;
1651     U16 charid;
1652     SV *sv=sv_newmortal();
1653     int colwidth= widecharmap ? 6 : 4;
1654     GET_RE_DEBUG_FLAGS_DECL;
1655
1656     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1657
1658     /*
1659        print out the table precompression so that we can do a visual check
1660        that they are identical.
1661      */
1662
1663     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664
         /* Header row: one column per character id */
1665     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667         if ( tmp ) {
1668             PerlIO_printf( Perl_debug_log, "%*s",
1669                 colwidth,
1670                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671                             PL_colors[0], PL_colors[1],
1672                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673                             PERL_PV_ESCAPE_FIRSTCHAR
1674                 )
1675             );
1676         }
1677     }
1678
1679     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680
         /* Separator row, 'colwidth' dashes per character */
1681     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1683     }
1684
1685     PerlIO_printf( Perl_debug_log, "\n" );
1686
         /* Here 'state' is an index into trie->trans, not a node number: it
          * advances by uniquecharcount per row, and the cells of a row are
          * trans[ state + charid ].  TRIE_NODENUM() is used to turn the index
          * into the value shown as the row label. */
1687     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688
1689         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690             (int)depth * 2 + 2,"",
1691             (UV)TRIE_NODENUM( state ) );
1692
             /* One cell per character: the converted 'next' value, or a dot
              * when that value is zero */
1693         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695             if (v)
1696                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697             else
1698                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699         }
             /* End of row: the 'check' value of the row's first entry, plus
              * the word number when the node has one */
1700         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702                                             (UV)trie->trans[ state ].check );
1703         } else {
1704             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705                                             (UV)trie->trans[ state ].check,
1706             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1707         }
1708     }
1709 }
1710
1711 #endif
1712
1713
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715   startbranch: the first branch in the whole branch sequence
1716   first      : start branch of sequence of branch-exact nodes.
1717                May be the same as startbranch
1718   last       : Thing following the last branch.
1719                May be the same as tail.
1720   tail       : item following the branch sequence
1721   word_count : number of words in the sequence
1722   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723   depth      : indent depth
1724
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. I.e., at the root node you look up the 1st character,
1729 follow that branch, and repeat until you reach the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1731
1732   /he|she|his|hers/
1733
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1738
1739       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1740       |    |
1741       |   (2)
1742       |    |
1743      (1)   +-i->(6)-+-s->[7]
1744       |
1745       +-s->(3)-+-h->(4)-+-e->[5]
1746
1747       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from each accepting state to the word that was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occurred in the alternation.
1756
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1759
1760  / (DUPE|DUPE) X? (?{ ... }) Y /x
1761
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1766
1767  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768
1769 which prints out 'word' three times, but
1770
1771  'words'=~/(word|word|word)(?{ print $1 })S/
1772
1773 which doesn't print it out at all. This is due to other optimisations kicking in.
1774
1775 Example of what happens on a structural level:
1776
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778
1779    1: CURLYM[1] {1,32767}(18)
1780    5:   BRANCH(8)
1781    6:     EXACT <ac>(16)
1782    8:   BRANCH(11)
1783    9:     EXACT <ad>(16)
1784   11:   BRANCH(14)
1785   12:     EXACT <ab>(16)
1786   16:   SUCCEED(0)
1787   17:   NOTHING(18)
1788   18: END(0)
1789
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1792
1793    1: CURLYM[1] {1,32767}(18)
1794    5:   TRIE(16)
1795         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1796           <ac>
1797           <ad>
1798           <ab>
1799   16:   SUCCEED(0)
1800   17:   NOTHING(18)
1801   18: END(0)
1802
1803 Cases where tail != last would be like /(?:foo|bar)baz/:
1804
1805    1: BRANCH(4)
1806    2:   EXACT <foo>(8)
1807    4: BRANCH(7)
1808    5:   EXACT <bar>(8)
1809    7: TAIL(8)
1810    8: EXACT <baz>(10)
1811   10: END(0)
1812
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1815
1816     1: TRIE(8)
1817       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1818         <foo>
1819         <bar>
1820    7: TAIL(8)
1821    8: EXACT <baz>(10)
1822   10: END(0)
1823
1824     d = uvchr_to_utf8_flags(d, uv, 0);
1825
1826 is the recommended Unicode-aware way of saying
1827
1828     *(d++) = uv;
1829 */
1830
     /* Append to the array 'revcharmap' (which must be in scope where the
      * macro is used) a new SV holding the character 'val'.
      *
      * Under UTF the SV is created with newSV(7), filled by uvchr_to_utf8(),
      * given a length equal to the number of bytes that call produced, and
      * flagged as a UTF-8 string.  Otherwise 'val' is cast to a single char
      * and stored as a one-byte string.
      *
      * 'val' is expanded without parentheses, so pass a simple expression. */
1831 #define TRIE_STORE_REVCHAR(val)                                            \
1832     STMT_START {                                                           \
1833         if (UTF) {                                                         \
1834             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1835             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1836             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1838             SvPOK_on(zlopp);                                               \
1839             SvUTF8_on(zlopp);                                              \
1840             av_push(revcharmap, zlopp);                                    \
1841         } else {                                                           \
1842             char ooooff = (char)val;                                           \
1843             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1844         }                                                                  \
1845         } STMT_END
1846
1847 /* This gets the next character from the input, folding it if not already
1848  * folded. */
     /* The macro takes no arguments; it works on these names, which must be in
      * scope where it is used:
      *   uc       pointer to the current input byte (read, not advanced here)
      *   uvc      receives the character value
      *   len      receives the number of bytes the character occupies
      *   wordlen  incremented once per call
      *   folder   selects the non-UTF behaviour
      * Three cases are distinguished.  Under UTF the character is decoded
      * with valid_utf8_to_uvchr() and taken as is.  Otherwise, when 'folder'
      * is PL_fold_latin1, the byte is lower-cased with toLOWER_L1() and a
      * MICRO SIGN is replaced by GREEK SMALL LETTER MU.  In all remaining
      * cases the raw byte is returned unfolded. */
1849 #define TRIE_READ_CHAR STMT_START {                                           \
1850     wordlen++;                                                                \
1851     if ( UTF ) {                                                              \
1852         /* if it is UTF then it is either already folded, or does not need    \
1853          * folding */                                                         \
1854         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1855     }                                                                         \
1856     else if (folder == PL_fold_latin1) {                                      \
1857         /* This folder implies Unicode rules, which in the range expressible  \
1858          *  by not UTF is the lower case, with the two exceptions, one of     \
1859          *  which should have been taken care of before calling this */       \
1860         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1861         uvc = toLOWER_L1(*uc);                                                \
1862         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1863         len = 1;                                                              \
1864     } else {                                                                  \
1865         /* raw data, will be folded later if needed */                        \
1866         uvc = (U32)*uc;                                                       \
1867         len = 1;                                                              \
1868     }                                                                         \
1869 } STMT_END
1870
1871
1872
     /* Append the transition (fid, ns) to the list of 'state': 'fid' is stored
      * in the 'forid' field and 'ns' in the 'newstate' field of the next free
      * slot, and the free-slot index is then incremented.
      *
      * If the free-slot index has reached the allocated length, the length
      * kept in the list header is doubled first and the list is Renew()ed to
      * that many entries.
      *
      * 'state' is expanded several times, so it must have no side effects.
      * A variable named 'trie' must be in scope. */
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1874     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1875         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1876         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877     }                                                           \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1879     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1880     TRIE_LIST_CUR( state )++;                                   \
1881 } STMT_END
1882
     /* Create the transition list of 'state': allocate 4 entries with Newxz()
      * and initialise the header kept in element 0, setting the next free
      * slot to 1 and the allocated length to 4.  Since element 0 is the
      * header, three transitions fit before TRIE_LIST_PUSH has to grow the
      * list.  A variable named 'trie' must be in scope. */
1883 #define TRIE_LIST_NEW(state) STMT_START {                       \
1884     Newxz( trie->states[ state ].trans.list,               \
1885         4, reg_trie_trans_le );                                 \
1886      TRIE_LIST_CUR( state ) = 1;                                \
1887      TRIE_LIST_LEN( state ) = 4;                                \
1888 } STMT_END
1889
     /* Record that the word just read ends in 'state'.
      *
      * Besides its argument the macro uses these names from the scope where
      * it is expanded: trie, noper, cur, tail, convert, word_count, wordlen,
      * curword, jumper, nextbranch and, inside DEBUG_r, trie_words.
      *
      * Steps, in order:
      *  - inside DEBUG_r, push a copy of the word's text onto 'trie_words'
      *    (an empty string when 'noper' is a NOTHING node);
      *  - increment 'curword' and fill in wordinfo[curword]: prev = 0,
      *    len = wordlen, accept = state;
      *  - if the node following 'noper' lies before 'tail', allocate
      *    trie->jump on first need (word_count + 1 entries) and store the
      *    distance from 'convert' to that node as this word's jump; 'jumper'
      *    and 'nextbranch' are set only if they are still NULL;
      *  - if 'state' already had a word number, splice 'curword' into the
      *    wordinfo[].prev chain directly after that earlier word; otherwise
      *    make 'curword' the word number of 'state'. */
1890 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1891     U16 dupe= trie->states[ state ].wordnum;                    \
1892     regnode * const noper_next = regnext( noper );              \
1893                                                                 \
1894     DEBUG_r({                                                   \
1895         /* store the word for dumping */                        \
1896         SV* tmp;                                                \
1897         if (OP(noper) != NOTHING)                               \
1898             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1899         else                                                    \
1900             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1901         av_push( trie_words, tmp );                             \
1902     });                                                         \
1903                                                                 \
1904     curword++;                                                  \
1905     trie->wordinfo[curword].prev   = 0;                         \
1906     trie->wordinfo[curword].len    = wordlen;                   \
1907     trie->wordinfo[curword].accept = state;                     \
1908                                                                 \
1909     if ( noper_next < tail ) {                                  \
1910         if (!trie->jump)                                        \
1911             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912                                                  sizeof(U16) ); \
1913         trie->jump[curword] = (U16)(noper_next - convert);      \
1914         if (!jumper)                                            \
1915             jumper = noper_next;                                \
1916         if (!nextbranch)                                        \
1917             nextbranch= regnext(cur);                           \
1918     }                                                           \
1919                                                                 \
1920     if ( dupe ) {                                               \
1921         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1922         /* chain, so that when the bits of chain are later    */\
1923         /* linked together, the dups appear in the chain      */\
1924         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925         trie->wordinfo[dupe].prev = curword;                    \
1926     } else {                                                    \
1927         /* we haven't inserted this word yet.                */ \
1928         trie->states[ state ].wordnum = curword;                \
1929     }                                                           \
1930 } STMT_END
1931
1932
     /* Expression giving the state reached from 'state' on character id
      * 'charid', where 'base' is the state's base value and 'ucharcount' the
      * number of distinct characters.
      *
      * The table slot examined is trie->trans[ base - ucharcount + charid ].
      * Its 'next' value is the result when all of these hold: base + charid is
      * at least 'ucharcount' and below 'ubound', the slot's 'check' equals
      * 'state', and its 'next' is non-zero.  Otherwise the result is 'special'
      * when 'state' is 1, and 0 for any other state.
      *
      * 'trie' and 'ubound' are not parameters; both must be in scope where the
      * macro is used.  The arguments are expanded without parentheses and
      * more than once, so pass simple, side-effect-free expressions. */
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1934      ( ( base + charid >=  ucharcount                                   \
1935          && base + charid < ubound                                      \
1936          && state == trie->trans[ base - ucharcount + charid ].check    \
1937          && trie->trans[ base - ucharcount + charid ].next )            \
1938            ? trie->trans[ base - ucharcount + charid ].next             \
1939            : ( state==1 ? special : 0 )                                 \
1940       )
1941
     /* Three distinct single-bit values (1, 2, 4).
      * NOTE(review): presumably the result codes of make_trie(), which is
      * defined right after them -- confirm at the points of use. */
1942 #define MADE_TRIE       1
1943 #define MADE_JUMP_TRIE  2
1944 #define MADE_EXACT_TRIE 4
1945
1946 STATIC I32
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948                   regnode *first, regnode *last, regnode *tail,
1949                   U32 word_count, U32 flags, U32 depth)
1950 {
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     assert(re_trie_maxbuff);
2011     if (!SvIOK(re_trie_maxbuff)) {
2012         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2013     }
2014     DEBUG_TRIE_COMPILE_r({
2015         PerlIO_printf( Perl_debug_log,
2016           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2017           (int)depth * 2 + 2, "",
2018           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2019           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2020     });
2021
2022    /* Find the node we are going to overwrite */
2023     if ( first == startbranch && OP( last ) != BRANCH ) {
2024         /* whole branch chain */
2025         convert = first;
2026     } else {
2027         /* branch sub-chain */
2028         convert = NEXTOPER( first );
2029     }
2030
2031     /*  -- First loop and Setup --
2032
2033        We first traverse the branches and scan each word to determine if it
2034        contains widechars, and how many unique chars there are, this is
2035        important as we have to build a table with at least as many columns as we
2036        have unique chars.
2037
2038        We use an array of integers to represent the character codes 0..255
2039        (trie->charmap) and we use an HV* to store Unicode characters. We use
2040        the native representation of the character value as the key and IV's for
2041        the coded index.
2042
2043        *TODO* If we keep track of how many times each character is used we can
2044        remap the columns so that the table compression later on is more
2045        efficient in terms of memory by ensuring the most common value is in the
2046        middle and the least common are on the outside.  IMO this would be better
2047        than a most to least common mapping as there's a decent chance the most
2048        common letter will share a node with the least common, meaning the node
2049        will not be compressible. With a middle is most common approach the worst
2050        case is when we have the least common nodes twice.
2051
2052      */
2053
2054     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2055         regnode *noper = NEXTOPER( cur );
2056         const U8 *uc = (U8*)STRING( noper );
2057         const U8 *e  = uc + STR_LEN( noper );
2058         int foldlen = 0;
2059         U32 wordlen      = 0;         /* required init */
2060         STRLEN minchars = 0;
2061         STRLEN maxchars = 0;
2062         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2063                                                bitmap?*/
2064
2065         if (OP(noper) == NOTHING) {
2066             regnode *noper_next= regnext(noper);
2067             if (noper_next != tail && OP(noper_next) == flags) {
2068                 noper = noper_next;
2069                 uc= (U8*)STRING(noper);
2070                 e= uc + STR_LEN(noper);
2071                 trie->minlen= STR_LEN(noper);
2072             } else {
2073                 trie->minlen= 0;
2074                 continue;
2075             }
2076         }
2077
2078         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2079             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2080                                           regardless of encoding */
2081             if (OP( noper ) == EXACTFU_SS) {
2082                 /* false positives are ok, so just set this */
2083                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2084             }
2085         }
2086         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2087                                            branch */
2088             TRIE_CHARCOUNT(trie)++;
2089             TRIE_READ_CHAR;
2090
2091             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2092              * is in effect.  Under /i, this character can match itself, or
2093              * anything that folds to it.  If not under /i, it can match just
2094              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2095              * all fold to k, and all are single characters.   But some folds
2096              * expand to more than one character, so for example LATIN SMALL
2097              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2098              * the string beginning at 'uc' is 'ffi', it could be matched by
2099              * three characters, or just by the one ligature character. (It
2100              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2101              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2102              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2103              * match.)  The trie needs to know the minimum and maximum number
2104              * of characters that could match so that it can use size alone to
2105              * quickly reject many match attempts.  The max is simple: it is
2106              * the number of folded characters in this branch (since a fold is
2107              * never shorter than what folds to it). */
2108
2109             maxchars++;
2110
2111             /* And the min is equal to the max if not under /i (indicated by
2112              * 'folder' being NULL), or there are no multi-character folds.  If
2113              * there is a multi-character fold, the min is incremented just
2114              * once, for the character that folds to the sequence.  Each
2115              * character in the sequence needs to be added to the list below of
2116              * characters in the trie, but we count only the first towards the
2117              * min number of characters needed.  This is done through the
2118              * variable 'foldlen', which is returned by the macros that look
2119              * for these sequences as the number of bytes the sequence
2120              * occupies.  Each time through the loop, we decrement 'foldlen' by
2121              * how many bytes the current char occupies.  Only when it reaches
2122              * 0 do we increment 'minchars' or look for another multi-character
2123              * sequence. */
2124             if (folder == NULL) {
2125                 minchars++;
2126             }
2127             else if (foldlen > 0) {
2128                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2129             }
2130             else {
2131                 minchars++;
2132
2133                 /* See if *uc is the beginning of a multi-character fold.  If
2134                  * so, we decrement the length remaining to look at, to account
2135                  * for the current character this iteration.  (We can use 'uc'
2136                  * instead of the fold returned by TRIE_READ_CHAR because for
2137                  * non-UTF, the latin1_safe macro is smart enough to account
2138                  * for all the unfolded characters, and because for UTF, the
2139                  * string will already have been folded earlier in the
2140                  * compilation process */
2141                 if (UTF) {
2142                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2143                         foldlen -= UTF8SKIP(uc);
2144                     }
2145                 }
2146                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2147                     foldlen--;
2148                 }
2149             }
2150
2151             /* The current character (and any potential folds) should be added
2152              * to the possible matching characters for this position in this
2153              * branch */
2154             if ( uvc < 256 ) {
2155                 if ( folder ) {
2156                     U8 folded= folder[ (U8) uvc ];
2157                     if ( !trie->charmap[ folded ] ) {
2158                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2159                         TRIE_STORE_REVCHAR( folded );
2160                     }
2161                 }
2162                 if ( !trie->charmap[ uvc ] ) {
2163                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2164                     TRIE_STORE_REVCHAR( uvc );
2165                 }
2166                 if ( set_bit ) {
2167                     /* store the codepoint in the bitmap, and its folded
2168                      * equivalent. */
2169                     TRIE_BITMAP_SET(trie, uvc);
2170
2171                     /* store the folded codepoint */
2172                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2173
2174                     if ( !UTF ) {
2175                         /* store first byte of utf8 representation of
2176                            variant codepoints */
2177                         if (! UVCHR_IS_INVARIANT(uvc)) {
2178                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2179                         }
2180                     }
2181                     set_bit = 0; /* We've done our bit :-) */
2182                 }
2183             } else {
2184
2185                 /* XXX We could come up with the list of code points that fold
2186                  * to this using PL_utf8_foldclosures, except not for
2187                  * multi-char folds, as there may be multiple combinations
2188                  * there that could work, which needs to wait until runtime to
2189                  * resolve (The comment about LIGATURE FFI above is such an
2190                  * example) */
2191
2192                 SV** svpp;
2193                 if ( !widecharmap )
2194                     widecharmap = newHV();
2195
2196                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2197
2198                 if ( !svpp )
2199                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2200
2201                 if ( !SvTRUE( *svpp ) ) {
2202                     sv_setiv( *svpp, ++trie->uniquecharcount );
2203                     TRIE_STORE_REVCHAR(uvc);
2204                 }
2205             }
2206         } /* end loop through characters in this branch of the trie */
2207
2208         /* We take the min and max for this branch and combine to find the min
2209          * and max for all branches processed so far */
2210         if( cur == first ) {
2211             trie->minlen = minchars;
2212             trie->maxlen = maxchars;
2213         } else if (minchars < trie->minlen) {
2214             trie->minlen = minchars;
2215         } else if (maxchars > trie->maxlen) {
2216             trie->maxlen = maxchars;
2217         }
2218     } /* end first pass */
2219     DEBUG_TRIE_COMPILE_r(
2220         PerlIO_printf( Perl_debug_log,
2221                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2222                 (int)depth * 2 + 2,"",
2223                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2224                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2225                 (int)trie->minlen, (int)trie->maxlen )
2226     );
2227
2228     /*
2229         We now know what we are dealing with in terms of unique chars and
2230         string sizes so we can calculate how much memory a naive
2231         representation using a flat table  will take. If it's over a reasonable
2232         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2233         conservative but potentially much slower representation using an array
2234         of lists.
2235
2236         At the end we convert both representations into the same compressed
2237         form that will be used in regexec.c for matching with. The latter
2238         is a form that cannot be used to construct with but has memory
2239         properties similar to the list form and access properties similar
2240         to the table form making it both suitable for fast searches and
2241         small enough that it's feasible to store for the duration of a program.
2242
2243         See the comment in the code where the compressed table is produced
2244         inplace from the flat table representation for an explanation of how
2245         the compression works.
2246
2247     */
2248
2249
2250     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2251     prev_states[1] = 0;
2252
2253     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2254                                                     > SvIV(re_trie_maxbuff) )
2255     {
2256         /*
2257             Second Pass -- Array Of Lists Representation
2258
2259             Each state will be represented by a list of charid:state records
2260             (reg_trie_trans_le) the first such element holds the CUR and LEN
2261             points of the allocated array. (See defines above).
2262
2263             We build the initial structure using the lists, and then convert
2264             it into the compressed table form which allows faster lookups
2265             (but can't be modified once converted).
2266         */
2267
2268         STRLEN transcount = 1;
2269
2270         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2271             "%*sCompiling trie using list compiler\n",
2272             (int)depth * 2 + 2, ""));
2273
2274         trie->states = (reg_trie_state *)
2275             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2276                                   sizeof(reg_trie_state) );
2277         TRIE_LIST_NEW(1);
2278         next_alloc = 2;
2279
2280         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2281
2282             regnode *noper   = NEXTOPER( cur );
2283             U8 *uc           = (U8*)STRING( noper );
2284             const U8 *e      = uc + STR_LEN( noper );
2285             U32 state        = 1;         /* required init */
2286             U16 charid       = 0;         /* sanity init */
2287             U32 wordlen      = 0;         /* required init */
2288
2289             if (OP(noper) == NOTHING) {
2290                 regnode *noper_next= regnext(noper);
2291                 if (noper_next != tail && OP(noper_next) == flags) {
2292                     noper = noper_next;
2293                     uc= (U8*)STRING(noper);
2294                     e= uc + STR_LEN(noper);
2295                 }
2296             }
2297
2298             if (OP(noper) != NOTHING) {
2299                 for ( ; uc < e ; uc += len ) {
2300
2301                     TRIE_READ_CHAR;
2302
2303                     if ( uvc < 256 ) {
2304                         charid = trie->charmap[ uvc ];
2305                     } else {
2306                         SV** const svpp = hv_fetch( widecharmap,
2307                                                     (char*)&uvc,
2308                                                     sizeof( UV ),
2309                                                     0);
2310                         if ( !svpp ) {
2311                             charid = 0;
2312                         } else {
2313                             charid=(U16)SvIV( *svpp );
2314                         }
2315                     }
2316                     /* charid is now 0 if we dont know the char read, or
2317                      * nonzero if we do */
2318                     if ( charid ) {
2319
2320                         U16 check;
2321                         U32 newstate = 0;
2322
2323                         charid--;
2324                         if ( !trie->states[ state ].trans.list ) {
2325                             TRIE_LIST_NEW( state );
2326                         }
2327                         for ( check = 1;
2328                               check <= TRIE_LIST_USED( state );
2329                               check++ )
2330                         {
2331                             if ( TRIE_LIST_ITEM( state, check ).forid
2332                                                                     == charid )
2333                             {
2334                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2335                                 break;
2336                             }
2337                         }
2338                         if ( ! newstate ) {
2339                             newstate = next_alloc++;
2340                             prev_states[newstate] = state;
2341                             TRIE_LIST_PUSH( state, charid, newstate );
2342                             transcount++;
2343                         }
2344                         state = newstate;
2345                     } else {
2346                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2347                     }
2348                 }
2349             }
2350             TRIE_HANDLE_WORD(state);
2351
2352         } /* end second pass */
2353
2354         /* next alloc is the NEXT state to be allocated */
2355         trie->statecount = next_alloc;
2356         trie->states = (reg_trie_state *)
2357             PerlMemShared_realloc( trie->states,
2358                                    next_alloc
2359                                    * sizeof(reg_trie_state) );
2360
2361         /* and now dump it out before we compress it */
2362         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2363                                                          revcharmap, next_alloc,
2364                                                          depth+1)
2365         );
2366
2367         trie->trans = (reg_trie_trans *)
2368             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2369         {
2370             U32 state;
2371             U32 tp = 0;
2372             U32 zp = 0;
2373
2374
2375             for( state=1 ; state < next_alloc ; state ++ ) {
2376                 U32 base=0;
2377
2378                 /*
2379                 DEBUG_TRIE_COMPILE_MORE_r(
2380                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2381                 );
2382                 */
2383
2384                 if (trie->states[state].trans.list) {
2385                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2386                     U16 maxid=minid;
2387                     U16 idx;
2388
2389                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2390                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2391                         if ( forid < minid ) {
2392                             minid=forid;
2393                         } else if ( forid > maxid ) {
2394                             maxid=forid;
2395                         }
2396                     }
2397                     if ( transcount < tp + maxid - minid + 1) {
2398                         transcount *= 2;
2399                         trie->trans = (reg_trie_trans *)
2400                             PerlMemShared_realloc( trie->trans,
2401                                                      transcount
2402                                                      * sizeof(reg_trie_trans) );
2403                         Zero( trie->trans + (transcount / 2),
2404                               transcount / 2,
2405                               reg_trie_trans );
2406                     }
2407                     base = trie->uniquecharcount + tp - minid;
2408                     if ( maxid == minid ) {
2409                         U32 set = 0;
2410                         for ( ; zp < tp ; zp++ ) {
2411                             if ( ! trie->trans[ zp ].next ) {
2412                                 base = trie->uniquecharcount + zp - minid;
2413                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2414                                                                    1).newstate;
2415                                 trie->trans[ zp ].check = state;
2416                                 set = 1;
2417                                 break;
2418                             }
2419                         }
2420                         if ( !set ) {
2421                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2422                                                                    1).newstate;
2423                             trie->trans[ tp ].check = state;
2424                             tp++;
2425                             zp = tp;
2426                         }
2427                     } else {
2428                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2429                             const U32 tid = base
2430                                            - trie->uniquecharcount
2431                                            + TRIE_LIST_ITEM( state, idx ).forid;
2432                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2433                                                                 idx ).newstate;
2434                             trie->trans[ tid ].check = state;
2435                         }
2436                         tp += ( maxid - minid + 1 );
2437                     }
2438                     Safefree(trie->states[ state ].trans.list);
2439                 }
2440                 /*
2441                 DEBUG_TRIE_COMPILE_MORE_r(
2442                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2443                 );
2444                 */
2445                 trie->states[ state ].trans.base=base;
2446             }
2447             trie->lasttrans = tp + 1;
2448         }
2449     } else {
2450         /*
2451            Second Pass -- Flat Table Representation.
2452
2453            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2454            each.  We know that we will need Charcount+1 trans at most to store
2455            the data (one row per char at worst case) So we preallocate both
2456            structures assuming worst case.
2457
2458            We then construct the trie using only the .next slots of the entry
2459            structs.
2460
2461            We use the .check field of the first entry of the node temporarily
2462            to make compression both faster and easier by keeping track of how
2463            many non zero fields are in the node.
2464
2465            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2466            transition.
2467
2468            There are two terms at use here: state as a TRIE_NODEIDX() which is
2469            a number representing the first entry of the node, and state as a
2470            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2471            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2472            if there are 2 entries per node, e.g.:
2473
2474              A B       A B
2475           1. 2 4    1. 3 7
2476           2. 0 3    3. 0 5
2477           3. 0 0    5. 0 0
2478           4. 0 0    7. 0 0
2479
2480            The table is internally in the right hand, idx form. However as we
2481            also have to deal with the states array which is indexed by nodenum
2482            we have to use TRIE_NODENUM() to convert.
2483
2484         */
2485         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2486             "%*sCompiling trie using table compiler\n",
2487             (int)depth * 2 + 2, ""));
2488
2489         trie->trans = (reg_trie_trans *)
2490             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2491                                   * trie->uniquecharcount + 1,
2492                                   sizeof(reg_trie_trans) );
2493         trie->states = (reg_trie_state *)
2494             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2495                                   sizeof(reg_trie_state) );
2496         next_alloc = trie->uniquecharcount + 1;
2497
2498
2499         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2500
2501             regnode *noper   = NEXTOPER( cur );
2502             const U8 *uc     = (U8*)STRING( noper );
2503             const U8 *e      = uc + STR_LEN( noper );
2504
2505             U32 state        = 1;         /* required init */
2506
2507             U16 charid       = 0;         /* sanity init */
2508             U32 accept_state = 0;         /* sanity init */
2509
2510             U32 wordlen      = 0;         /* required init */
2511
2512             if (OP(noper) == NOTHING) {
2513                 regnode *noper_next= regnext(noper);
2514                 if (noper_next != tail && OP(noper_next) == flags) {
2515                     noper = noper_next;
2516                     uc= (U8*)STRING(noper);
2517                     e= uc + STR_LEN(noper);
2518                 }
2519             }
2520
2521             if ( OP(noper) != NOTHING ) {
2522                 for ( ; uc < e ; uc += len ) {
2523
2524                     TRIE_READ_CHAR;
2525
2526                     if ( uvc < 256 ) {
2527                         charid = trie->charmap[ uvc ];
2528                     } else {
2529                         SV* const * const svpp = hv_fetch( widecharmap,
2530                                                            (char*)&uvc,
2531                                                            sizeof( UV ),
2532                                                            0);
2533                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2534                     }
2535                     if ( charid ) {
2536                         charid--;
2537                         if ( !trie->trans[ state + charid ].next ) {
2538                             trie->trans[ state + charid ].next = next_alloc;
2539                             trie->trans[ state ].check++;
2540                             prev_states[TRIE_NODENUM(next_alloc)]
2541                                     = TRIE_NODENUM(state);
2542                             next_alloc += trie->uniquecharcount;
2543                         }
2544                         state = trie->trans[ state + charid ].next;
2545                     } else {
2546                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2547                     }
2548                     /* charid is now 0 if we dont know the char read, or
2549                      * nonzero if we do */
2550                 }
2551             }
2552             accept_state = TRIE_NODENUM( state );
2553             TRIE_HANDLE_WORD(accept_state);
2554
2555         } /* end second pass */
2556
2557         /* and now dump it out before we compress it */
2558         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2559                                                           revcharmap,
2560                                                           next_alloc, depth+1));
2561
2562         {
2563         /*
2564            * Inplace compress the table.*
2565
2566            For sparse data sets the table constructed by the trie algorithm will
2567            be mostly 0/FAIL transitions or to put it another way mostly empty.
2568            (Note that leaf nodes will not contain any transitions.)
2569
2570            This algorithm compresses the tables by eliminating most such
2571            transitions, at the cost of a modest bit of extra work during lookup:
2572
2573            - Each states[] entry contains a .base field which indicates the
2574            index in the trans[] array where its transition data is stored.
2575
2576            - If .base is 0 there are no valid transitions from that node.
2577
2578            - If .base is nonzero then charid is added to it to find an entry in
2579            the trans array.
2580
2581            -If trans[states[state].base+charid].check!=state then the
2582            transition is taken to be a 0/Fail transition. Thus if there are fail
2583            transitions at the front of the node then the .base offset will point
2584            somewhere inside the previous nodes data (or maybe even into a node
2585            even earlier), but the .check field determines if the transition is
2586            valid.
2587
2588            XXX - wrong maybe?
2589            The following process inplace converts the table to the compressed
2590            table: We first do not compress the root node 1,and mark all its
2591            .check pointers as 1 and set its .base pointer as 1 as well. This
2592            allows us to do a DFA construction from the compressed table later,
2593            and ensures that any .base pointers we calculate later are greater
2594            than 0.
2595
2596            - We set 'pos' to indicate the first entry of the second node.
2597
2598            - We then iterate over the columns of the node, finding the first and
2599            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2600            and set the .check pointers accordingly, and advance pos
2601            appropriately and repeat for the next node. Note that when we copy
2602            the next pointers we have to convert them from the original
2603            NODEIDX form to NODENUM form as the former is not valid post
2604            compression.
2605
2606            - If a node has no transitions used we mark its base as 0 and do not
2607            advance the pos pointer.
2608
2609            - If a node only has one transition we use a second pointer into the
2610            structure to fill in allocated fail transitions from other states.
2611            This pointer is independent of the main pointer and scans forward
2612            looking for null transitions that are allocated to a state. When it
2613            finds one it writes the single transition into the "hole".  If the
2614            pointer doesn't find one, the single transition is appended as normal.
2615
2616            - Once compressed we can Renew/realloc the structures to release the
2617            excess space.
2618
2619            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2620            specifically Fig 3.47 and the associated pseudocode.
2621
2622            demq
2623         */
2624         const U32 laststate = TRIE_NODENUM( next_alloc );
2625         U32 state, charid;
2626         U32 pos = 0, zp=0;
2627         trie->statecount = laststate;
2628
2629         for ( state = 1 ; state < laststate ; state++ ) {
2630             U8 flag = 0;
2631             const U32 stateidx = TRIE_NODEIDX( state );
2632             const U32 o_used = trie->trans[ stateidx ].check;
2633             U32 used = trie->trans[ stateidx ].check;
2634             trie->trans[ stateidx ].check = 0;
2635
2636             for ( charid = 0;
2637                   used && charid < trie->uniquecharcount;
2638                   charid++ )
2639             {
2640                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2641                     if ( trie->trans[ stateidx + charid ].next ) {
2642                         if (o_used == 1) {
2643                             for ( ; zp < pos ; zp++ ) {
2644                                 if ( ! trie->trans[ zp ].next ) {
2645                                     break;
2646                                 }
2647                             }
2648                             trie->states[ state ].trans.base
2649                                                     = zp
2650                                                       + trie->uniquecharcount
2651                                                       - charid ;
2652                             trie->trans[ zp ].next
2653                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2654                                                              + charid ].next );
2655                             trie->trans[ zp ].check = state;
2656                             if ( ++zp > pos ) pos = zp;
2657                             break;
2658                         }
2659                         used--;
2660                     }
2661                     if ( !flag ) {
2662                         flag = 1;
2663                         trie->states[ state ].trans.base
2664                                        = pos + trie->uniquecharcount - charid ;
2665                     }
2666                     trie->trans[ pos ].next
2667                         = SAFE_TRIE_NODENUM(
2668                                        trie->trans[ stateidx + charid ].next );
2669                     trie->trans[ pos ].check = state;
2670                     pos++;
2671                 }
2672             }
2673         }
2674         trie->lasttrans = pos + 1;
2675         trie->states = (reg_trie_state *)
2676             PerlMemShared_realloc( trie->states, laststate
2677                                    * sizeof(reg_trie_state) );
2678         DEBUG_TRIE_COMPILE_MORE_r(
2679             PerlIO_printf( Perl_debug_log,
2680                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2681                 (int)depth * 2 + 2,"",
2682                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2683                        + 1 ),
2684                 (IV)next_alloc,
2685                 (IV)pos,
2686                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2687             );
2688
2689         } /* end table compress */
2690     }
2691     DEBUG_TRIE_COMPILE_MORE_r(
2692             PerlIO_printf(Perl_debug_log,
2693                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2694                 (int)depth * 2 + 2, "",
2695                 (UV)trie->statecount,
2696                 (UV)trie->lasttrans)
2697     );
2698     /* resize the trans array to remove unused space */
2699     trie->trans = (reg_trie_trans *)
2700         PerlMemShared_realloc( trie->trans, trie->lasttrans
2701                                * sizeof(reg_trie_trans) );
2702
2703     {   /* Modify the program and insert the new TRIE node */
2704         U8 nodetype =(U8)(flags & 0xFF);
2705         char *str=NULL;
2706
2707 #ifdef DEBUGGING
2708         regnode *optimize = NULL;
2709 #ifdef RE_TRACK_PATTERN_OFFSETS
2710
2711         U32 mjd_offset = 0;
2712         U32 mjd_nodelen = 0;
2713 #endif /* RE_TRACK_PATTERN_OFFSETS */
2714 #endif /* DEBUGGING */
2715         /*
2716            This means we convert either the first branch or the first Exact,
2717            depending on whether the thing following (in 'last') is a branch
2718            or not and whether first is the startbranch (ie is it a sub part of
2719            the alternation or is it the whole thing.)
2720            Assuming it's a sub part we convert the EXACT otherwise we convert
2721            the whole branch sequence, including the first.
2722          */
2723         /* Find the node we are going to overwrite */
2724         if ( first != startbranch || OP( last ) == BRANCH ) {
2725             /* branch sub-chain */
2726             NEXT_OFF( first ) = (U16)(last - first);
2727 #ifdef RE_TRACK_PATTERN_OFFSETS
2728             DEBUG_r({
2729                 mjd_offset= Node_Offset((convert));
2730                 mjd_nodelen= Node_Length((convert));
2731             });
2732 #endif
2733             /* whole branch chain */
2734         }
2735 #ifdef RE_TRACK_PATTERN_OFFSETS
2736         else {
2737             DEBUG_r({
2738                 const  regnode *nop = NEXTOPER( convert );
2739                 mjd_offset= Node_Offset((nop));
2740                 mjd_nodelen= Node_Length((nop));
2741             });
2742         }
2743         DEBUG_OPTIMISE_r(
2744             PerlIO_printf(Perl_debug_log,
2745                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2746                 (int)depth * 2 + 2, "",
2747                 (UV)mjd_offset, (UV)mjd_nodelen)
2748         );
2749 #endif
2750         /* But first we check to see if there is a common prefix we can
2751            split out as an EXACT and put in front of the TRIE node.  */
2752         trie->startstate= 1;
2753         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2754             U32 state;
2755             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2756                 U32 ofs = 0;
2757                 I32 idx = -1;
2758                 U32 count = 0;
2759                 const U32 base = trie->states[ state ].trans.base;
2760
2761                 if ( trie->states[state].wordnum )
2762                         count = 1;
2763
2764                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2765                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2766                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2767                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2768                     {
2769                         if ( ++count > 1 ) {
2770                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2771                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2772                             if ( state == 1 ) break;
2773                             if ( count == 2 ) {
2774                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2775                                 DEBUG_OPTIMISE_r(
2776                                     PerlIO_printf(Perl_debug_log,
2777                                         "%*sNew Start State=%"UVuf" Class: [",
2778                                         (int)depth * 2 + 2, "",
2779                                         (UV)state));
2780                                 if (idx >= 0) {
2781                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2782                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2783
2784                                     TRIE_BITMAP_SET(trie,*ch);
2785                                     if ( folder )
2786                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2787                                     DEBUG_OPTIMISE_r(
2788                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2789                                     );
2790                                 }
2791                             }
2792                             TRIE_BITMAP_SET(trie,*ch);
2793                             if ( folder )
2794                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2795                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2796                         }
2797                         idx = ofs;
2798                     }
2799                 }
2800                 if ( count == 1 ) {
2801                     SV **tmp = av_fetch( revcharmap, idx, 0);
2802                     STRLEN len;
2803                     char *ch = SvPV( *tmp, len );
2804                     DEBUG_OPTIMISE_r({
2805                         SV *sv=sv_newmortal();
2806                         PerlIO_printf( Perl_debug_log,
2807                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2808                             (int)depth * 2 + 2, "",
2809                             (UV)state, (UV)idx,
2810                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2811                                 PL_colors[0], PL_colors[1],
2812                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2813                                 PERL_PV_ESCAPE_FIRSTCHAR
2814                             )
2815                         );
2816                     });
2817                     if ( state==1 ) {
2818                         OP( convert ) = nodetype;
2819                         str=STRING(convert);
2820                         STR_LEN(convert)=0;
2821                     }
2822                     STR_LEN(convert) += len;
2823                     while (len--)
2824                         *str++ = *ch++;
2825                 } else {
2826 #ifdef DEBUGGING
2827                     if (state>1)
2828                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2829 #endif
2830                     break;
2831                 }
2832             }
2833             trie->prefixlen = (state-1);
2834             if (str) {
2835                 regnode *n = convert+NODE_SZ_STR(convert);
2836                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2837                 trie->startstate = state;
2838                 trie->minlen -= (state - 1);
2839                 trie->maxlen -= (state - 1);
2840 #ifdef DEBUGGING
2841                /* At least the UNICOS C compiler choked on this
2842                 * being argument to DEBUG_r(), so let's just have
2843                 * it right here. */
2844                if (
2845 #ifdef PERL_EXT_RE_BUILD
2846                    1
2847 #else
2848                    DEBUG_r_TEST
2849 #endif
2850                    ) {
2851                    regnode *fix = convert;
2852                    U32 word = trie->wordcount;
2853                    mjd_nodelen++;
2854                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2855                    while( ++fix < n ) {
2856                        Set_Node_Offset_Length(fix, 0, 0);
2857                    }
2858                    while (word--) {
2859                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2860                        if (tmp) {
2861                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2862                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2863                            else
2864                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2865                        }
2866                    }
2867                }
2868 #endif
2869                 if (trie->maxlen) {
2870                     convert = n;
2871                 } else {
2872                     NEXT_OFF(convert) = (U16)(tail - convert);
2873                     DEBUG_r(optimize= n);
2874                 }
2875             }
2876         }
2877         if (!jumper)
2878             jumper = last;
2879         if ( trie->maxlen ) {
2880             NEXT_OFF( convert ) = (U16)(tail - convert);
2881             ARG_SET( convert, data_slot );
2882             /* Store the offset to the first unabsorbed branch in
2883                jump[0], which is otherwise unused by the jump logic.
2884                We use this when dumping a trie and during optimisation. */
2885             if (trie->jump)
2886                 trie->jump[0] = (U16)(nextbranch - convert);
2887
2888             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2889              *   and there is a bitmap
2890              *   and the first "jump target" node we found leaves enough room
2891              * then convert the TRIE node into a TRIEC node, with the bitmap
2892              * embedded inline in the opcode - this is hypothetically faster.
2893              */
2894             if ( !trie->states[trie->startstate].wordnum
2895                  && trie->bitmap
2896                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2897             {
2898                 OP( convert ) = TRIEC;
2899                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2900                 PerlMemShared_free(trie->bitmap);
2901                 trie->bitmap= NULL;
2902             } else
2903                 OP( convert ) = TRIE;
2904
2905             /* store the type in the flags */
2906             convert->flags = nodetype;
2907             DEBUG_r({
2908             optimize = convert
2909                       + NODE_STEP_REGNODE
2910                       + regarglen[ OP( convert ) ];
2911             });
2912             /* XXX We really should free up the resource in trie now,
2913                    as we won't use them - (which resources?) dmq */
2914         }
2915         /* needed for dumping*/
2916         DEBUG_r(if (optimize) {
2917             regnode *opt = convert;
2918
2919             while ( ++opt < optimize) {
2920                 Set_Node_Offset_Length(opt,0,0);
2921             }
2922             /*
2923                 Try to clean up some of the debris left after the
2924                 optimisation.
2925              */
2926             while( optimize < jumper ) {
2927                 mjd_nodelen += Node_Length((optimize));
2928                 OP( optimize ) = OPTIMIZED;
2929                 Set_Node_Offset_Length(optimize,0,0);
2930                 optimize++;
2931             }
2932             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2933         });
2934     } /* end node insert */
2935
2936     /*  Finish populating the prev field of the wordinfo array.  Walk back
2937      *  from each accept state until we find another accept state, and if
2938      *  so, point the first word's .prev field at the second word. If the
2939      *  second already has a .prev field set, stop now. This will be the
2940      *  case either if we've already processed that word's accept state,
2941      *  or that state had multiple words, and the overspill words were
2942      *  already linked up earlier.
2943      */
2944     {
2945         U16 word;
2946         U32 state;
2947         U16 prev;
2948
2949         for (word=1; word <= trie->wordcount; word++) {
2950             prev = 0;
2951             if (trie->wordinfo[word].prev)
2952                 continue;
2953             state = trie->wordinfo[word].accept;
2954             while (state) {
2955                 state = prev_states[state];
2956                 if (!state)
2957                     break;
2958                 prev = trie->states[state].wordnum;
2959                 if (prev)
2960                     break;
2961             }
2962             trie->wordinfo[word].prev = prev;
2963         }
2964         Safefree(prev_states);
2965     }
2966
2967
2968     /* and now dump out the compressed format */
2969     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2970
2971     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2972 #ifdef DEBUGGING
2973     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2974     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2975 #else
2976     SvREFCNT_dec_NN(revcharmap);
2977 #endif
2978     return trie->jump
2979            ? MADE_JUMP_TRIE
2980            : trie->startstate>1
2981              ? MADE_EXACT_TRIE
2982              : MADE_TRIE;
2983 }
2984
2985 STATIC regnode *
2986 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2987 {
2988 /* The Trie is constructed and compressed now so we can build a fail array if
2989  * it's needed
2990
2991    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2992    3.32 in the
2993    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2994    Ullman 1985/88
2995    ISBN 0-201-10088-6
2996
2997    We find the fail state for each state in the trie, this state is the longest
2998    proper suffix of the current state's 'word' that is also a proper prefix of
2999    another word in our trie. State 1 represents the word '' and is thus the
3000    default fail state. This allows the DFA not to have to restart after its
3001    tried and failed a word at a given point, it simply continues as though it
3002    had been matching the other word in the first place.
3003    Consider
3004       'abcdgu'=~/abcdefg|cdgu/
3005    When we get to 'd' we are still matching the first word, we would encounter
3006    'g' which would fail, which would bring us to the state representing 'd' in
3007    the second word where we would try 'g' and succeed, proceeding to match
3008    'cdgu'.
3009  */
3010  /* add a fail transition */
3011     const U32 trie_offset = ARG(source);
3012     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3013     U32 *q;
3014     const U32 ucharcount = trie->uniquecharcount;
3015     const U32 numstates = trie->statecount;
3016     const U32 ubound = trie->lasttrans + ucharcount;
3017     U32 q_read = 0;
3018     U32 q_write = 0;
3019     U32 charid;
3020     U32 base = trie->states[ 1 ].trans.base;
3021     U32 *fail;
3022     reg_ac_data *aho;
3023     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3024     regnode *stclass;
3025     GET_RE_DEBUG_FLAGS_DECL;
3026
3027     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3028     PERL_UNUSED_CONTEXT;
3029 #ifndef DEBUGGING
3030     PERL_UNUSED_ARG(depth);
3031 #endif
3032
3033     if ( OP(source) == TRIE ) {
3034         struct regnode_1 *op = (struct regnode_1 *)
3035             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3036         StructCopy(source,op,struct regnode_1);
3037         stclass = (regnode *)op;
3038     } else {
3039         struct regnode_charclass *op = (struct regnode_charclass *)
3040             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3041         StructCopy(source,op,struct regnode_charclass);
3042         stclass = (regnode *)op;
3043     }
3044     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3045
3046     ARG_SET( stclass, data_slot );
3047     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3048     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3049     aho->trie=trie_offset;
3050     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3051     Copy( trie->states, aho->states, numstates, reg_trie_state );
3052     Newxz( q, numstates, U32);
3053     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3054     aho->refcount = 1;
3055     fail = aho->fail;
3056     /* initialize fail[0..1] to be 1 so that we always have
3057        a valid final fail state */
3058     fail[ 0 ] = fail[ 1 ] = 1;
3059
3060     for ( charid = 0; charid < ucharcount ; charid++ ) {
3061         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3062         if ( newstate ) {
3063             q[ q_write ] = newstate;
3064             /* set to point at the root */
3065             fail[ q[ q_write++ ] ]=1;
3066         }
3067     }
3068     while ( q_read < q_write) {
3069         const U32 cur = q[ q_read++ % numstates ];
3070         base = trie->states[ cur ].trans.base;
3071
3072         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3073             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3074             if (ch_state) {
3075                 U32 fail_state = cur;
3076                 U32 fail_base;
3077                 do {
3078                     fail_state = fail[ fail_state ];
3079                     fail_base = aho->states[ fail_state ].trans.base;
3080                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3081
3082                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3083                 fail[ ch_state ] = fail_state;
3084                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3085                 {
3086                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3087                 }
3088                 q[ q_write++ % numstates] = ch_state;
3089             }
3090         }
3091     }
3092     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3093        when we fail in state 1, this allows us to use the
3094        charclass scan to find a valid start char. This is based on the principle
3095        that theres a good chance the string being searched contains lots of stuff
3096        that cant be a start char.
3097      */
3098     fail[ 0 ] = fail[ 1 ] = 0;
3099     DEBUG_TRIE_COMPILE_r({
3100         PerlIO_printf(Perl_debug_log,
3101                       "%*sStclass Failtable (%"UVuf" states): 0",
3102                       (int)(depth * 2), "", (UV)numstates
3103         );
3104         for( q_read=1; q_read<numstates; q_read++ ) {
3105             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3106         }
3107         PerlIO_printf(Perl_debug_log, "\n");
3108     });
3109     Safefree(q);
3110     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3111     return stclass;
3112 }
3113
3114
/* DEBUG_PEEP(str, scan, depth): trace helper, emitted through
 * DEBUG_OPTIMISE_r.
 *
 * When 'scan' is non-NULL, this describes the node into a fresh mortal SV
 * using regprop() and prints one line to Perl_debug_log: an indent of
 * 2*depth columns, then 'str', the node number of 'scan', the node
 * description, and the node number of regnext(scan) (0 if there is none).
 *
 *   str    must be a string literal: it is pasted into the format string by
 *          literal concatenation.
 *   scan   is expanded several times, so pass a side-effect-free expression.
 *   depth  is parenthesised under the cast so that an argument such as
 *          'depth+1' is doubled as a whole instead of expanding to
 *          '(int)depth+1*2'.
 *
 * The expansion still ends in ';' so that existing call sites behave exactly
 * as before; as a consequence a use written as 'DEBUG_PEEP(...);' yields an
 * additional empty statement.
 */
#define DEBUG_PEEP(str,scan,depth) \
    DEBUG_OPTIMISE_r({if (scan){ \
       SV * const mysv=sv_newmortal(); \
       regnode *Next = regnext(scan); \
       regprop(RExC_rx, mysv, scan, NULL); \
       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
       Next ? (REG_NODE_NUM(Next)) : 0 ); \
   }});
3124
3125
3126 /* The below joins as many adjacent EXACTish nodes as possible into a single
3127  * one.  The regop may be changed if the node(s) contain certain sequences that
3128  * require special handling.  The joining is only done if:
3129  * 1) there is room in the current conglomerated node to entirely contain the
3130  *    next one.
3131  * 2) they are the exact same node type
3132  *
3133  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3134  * these get optimized out
3135  *
3136  * If a node is to match under /i (folded), the number of characters it matches
3137  * can be different than its character length if it contains a multi-character
3138  * fold.  *min_subtract is set to the total delta number of characters of the
3139  * input nodes.
3140  *
3141  * And *unfolded_multi_char is set to indicate whether or not the node contains
3142  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3143  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3144  * SMALL LETTER SHARP S, as only if the target string being matched against
3145  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3146  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3147  * whose components are all above the Latin1 range are not run-time locale
3148  * dependent, and have already been folded by the time this function is
3149  * called.)
3150  *
3151  * This is as good a place as any to discuss the design of handling these
3152  * multi-character fold sequences.  It's been wrong in Perl for a very long
3153  * time.  There are three code points in Unicode whose multi-character folds
3154  * were long ago discovered to mess things up.  The previous designs for
3155  * dealing with these involved assigning a special node for them.  This
3156  * approach doesn't always work, as evidenced by this example:
3157  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3158  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3159  * would match just the \xDF, it won't be able to handle the case where a
3160  * successful match would have to cross the node's boundary.  The new approach
3161  * that hopefully generally solves the problem generates an EXACTFU_SS node
3162  * that is "sss" in this case.
3163  *
3164  * It turns out that there are problems with all multi-character folds, and not
3165  * just these three.  Now the code is general, for all such cases.  The
3166  * approach taken is:
3167  * 1)   This routine examines each EXACTFish node that could contain multi-
3168  *      character folded sequences.  Since a single character can fold into
3169  *      such a sequence, the minimum match length for this node is less than
3170  *      the number of characters in the node.  This routine returns in
3171  *      *min_subtract how many characters to subtract from the actual
3172  *      length of the string to get a real minimum match length; it is 0 if
3173  *      there are no multi-char folds.  This delta is used by the caller to
3174  *      adjust the min length of the match, and the delta between min and max,
3175  *      so that the optimizer doesn't reject these possibilities based on size
3176  *      constraints.
3177  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3178  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3179  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3180  *      there is a possible fold length change.  That means that a regular
3181  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3182  *      with length changes, and so can be processed faster.  regexec.c takes
3183  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3184  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3185  *      known until runtime).  This saves effort in regex matching.  However,
3186  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3187  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3188  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3189  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3190  *      possibilities for the non-UTF8 patterns are quite simple, except for
3191  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3192  *      members of a fold-pair, and arrays are set up for all of them so that
3193  *      the other member of the pair can be found quickly.  Code elsewhere in
3194  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3195  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3196  *      described in the next item.
3197  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3198  *      validity of the fold won't be known until runtime, and so must remain
3199  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3200  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3201  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3202  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3203  *      The reason this is a problem is that the optimizer part of regexec.c
3204  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3205  *      that a character in the pattern corresponds to at most a single
3206  *      character in the target string.  (And I do mean character, and not byte
3207  *      here, unlike other parts of the documentation that have never been
3208  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3209  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3210  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3211  *      nodes, violate the assumption, and they are the only instances where it
3212  *      is violated.  I'm reluctant to try to change the assumption, as the
3213  *      code involved is impenetrable to me (khw), so instead the code here
3214  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3215  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3216  *      boolean indicating whether or not the node contains such a fold.  When
3217  *      it is true, the caller sets a flag that later causes the optimizer in
3218  *      this file to not set values for the floating and fixed string lengths,
3219  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3220  *      assumption.  Thus, there is no optimization based on string lengths for
3221  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3222  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3223  *      assumption is wrong only in these cases is that all other non-UTF-8
3224  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3225  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3226  *      EXACTF nodes because we don't know at compile time if it actually
3227  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3228  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3229  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3230  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the
3231  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3232  *      string would require the pattern to be forced into UTF-8, the overhead
3233  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3234  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3235  *      locale.)
3236  *
3237  *      Similarly, the code that generates tries doesn't currently handle
3238  *      not-already-folded multi-char folds, and it looks like a pain to change
3239  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3240  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3241  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3242  *      using /iaa matching will be doing so almost entirely with ASCII
3243  *      strings, so this should rarely be encountered in practice */
3244
/* JOIN_EXACT(scan, min_subtract, unfolded_multi_char, flags)
 *
 * Statement macro: if the node 'scan' is of regkind EXACT, call join_exact()
 * on it, passing NULL for the 'val' argument and depth+1 for the depth.  The
 * return value of join_exact() is discarded.  The macro picks up
 * 'pRExC_state' and 'depth' from the scope in which it is expanded.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement.  A bare 'if' here would capture the 'else' of any enclosing
 * 'if (...) JOIN_EXACT(...); else ...' at the call site.  Use it as an
 * ordinary statement, followed by a semicolon.
 * NOTE(review): if the project has its own statement-wrapper macros for this
 * purpose, prefer those over the raw do/while.
 */
#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
    do { \
        if (PL_regkind[OP(scan)] == EXACT) \
            join_exact(pRExC_state,(scan),(min_subtract),(unfolded_multi_char), (flags),NULL,depth+1); \
    } while (0)
3248
3249 STATIC U32
3250 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3251                    UV *min_subtract, bool *unfolded_multi_char,
3252                    U32 flags,regnode *val, U32 depth)
3253 {
3254     /* Merge several consecutive EXACTish nodes into one. */
3255     regnode *n = regnext(scan);
3256     U32 stringok = 1;
3257     regnode *next = scan + NODE_SZ_STR(scan);
3258     U32 merged = 0;
3259     U32 stopnow = 0;
3260 #ifdef DEBUGGING
3261     regnode *stop = scan;
3262     GET_RE_DEBUG_FLAGS_DECL;
3263 #else
3264     PERL_UNUSED_ARG(depth);
3265 #endif
3266
3267     PERL_ARGS_ASSERT_JOIN_EXACT;
3268 #ifndef EXPERIMENTAL_INPLACESCAN
3269     PERL_UNUSED_ARG(flags);
3270     PERL_UNUSED_ARG(val);
3271 #endif
3272     DEBUG_PEEP("join",scan,depth);
3273
3274     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3275      * EXACT ones that are mergeable to the current one. */
3276     while (n
3277            && (PL_regkind[OP(n)] == NOTHING
3278                || (stringok && OP(n) == OP(scan)))
3279            && NEXT_OFF(n)
3280            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3281     {
3282
3283         if (OP(n) == TAIL || n > next)
3284             stringok = 0;
3285         if (PL_regkind[OP(n)] == NOTHING) {
3286             DEBUG_PEEP("skip:",n,depth);
3287             NEXT_OFF(scan) += NEXT_OFF(n);
3288             next = n + NODE_STEP_REGNODE;
3289 #ifdef DEBUGGING
3290             if (stringok)
3291                 stop = n;
3292 #endif
3293             n = regnext(n);
3294         }
3295         else if (stringok) {
3296             const unsigned int oldl = STR_LEN(scan);
3297             regnode * const nnext = regnext(n);
3298
3299             /* XXX I (khw) kind of doubt that this works on platforms (should
3300              * Perl ever run on one) where U8_MAX is above 255 because of lots
3301              * of other assumptions */
3302             /* Don't join if the sum can't fit into a single node */
3303             if (oldl + STR_LEN(n) > U8_MAX)
3304                 break;
3305
3306             DEBUG_PEEP("merg",n,depth);
3307             merged++;
3308
3309             NEXT_OFF(scan) += NEXT_OFF(n);
3310             STR_LEN(scan) += STR_LEN(n);
3311             next = n + NODE_SZ_STR(n);
3312             /* Now we can overwrite *n : */
3313             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3314 #ifdef DEBUGGING
3315             stop = next - 1;
3316 #endif
3317             n = nnext;
3318             if (stopnow) break;
3319         }
3320
3321 #ifdef EXPERIMENTAL_INPLACESCAN
3322         if (flags && !NEXT_OFF(n)) {
3323             DEBUG_PEEP("atch", val, depth);
3324             if (reg_off_by_arg[OP(n)]) {
3325                 ARG_SET(n, val - n);
3326             }
3327             else {
3328                 NEXT_OFF(n) = val - n;
3329             }
3330             stopnow = 1;
3331         }
3332 #endif
3333     }
3334
3335     *min_subtract = 0;
3336     *unfolded_multi_char = FALSE;
3337
3338     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3339      * can now analyze for sequences of problematic code points.  (Prior to
3340      * this final joining, sequences could have been split over boundaries, and
3341      * hence missed).  The sequences only happen in folding, hence for any
3342      * non-EXACT EXACTish node */
3343     if (OP(scan) != EXACT) {
3344         U8* s0 = (U8*) STRING(scan);
3345         U8* s = s0;
3346         U8* s_end = s0 + STR_LEN(scan);
3347
3348         int total_count_delta = 0;  /* Total delta number of characters that
3349                                        multi-char folds expand to */
3350
3351         /* One pass is made over the node's string looking for all the
3352          * possibilities.  To avoid some tests in the loop, there are two main
3353          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3354          * non-UTF-8 */
3355         if (UTF) {
3356             U8* folded = NULL;
3357
3358             if (OP(scan) == EXACTFL) {
3359                 U8 *d;
3360
3361                 /* An EXACTFL node would already have been changed to another
3362                  * node type unless there is at least one character in it that
3363                  * is problematic; likely a character whose fold definition
3364                  * won't be known until runtime, and so has yet to be folded.
3365                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3366                  * to handle the UTF-8 case, we need to create a temporary
3367                  * folded copy using UTF-8 locale rules in order to analyze it.
3368                  * This is because our macros that look to see if a sequence is
3369                  * a multi-char fold assume everything is folded (otherwise the
3370                  * tests in those macros would be too complicated and slow).
3371                  * Note that here, the non-problematic folds will have already
3372                  * been done, so we can just copy such characters.  We actually
3373                  * don't completely fold the EXACTFL string.  We skip the
3374                  * unfolded multi-char folds, as that would just create work
3375                  * below to figure out the size they already are */
3376
3377                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3378                 d = folded;
3379                 while (s < s_end) {
3380                     STRLEN s_len = UTF8SKIP(s);
3381                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3382                         Copy(s, d, s_len, U8);
3383                         d += s_len;
3384                     }
3385                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3386                         *unfolded_multi_char = TRUE;
3387                         Copy(s, d, s_len, U8);
3388                         d += s_len;
3389                     }
3390                     else if (isASCII(*s)) {
3391                         *(d++) = toFOLD(*s);
3392                     }
3393                     else {
3394                         STRLEN len;
3395                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3396                         d += len;
3397                     }
3398                     s += s_len;
3399                 }
3400
3401                 /* Point the remainder of the routine to look at our temporary
3402                  * folded copy */
3403                 s = folded;
3404                 s_end = d;
3405             } /* End of creating folded copy of EXACTFL string */
3406
3407             /* Examine the string for a multi-character fold sequence.  UTF-8
3408              * patterns have all characters pre-folded by the time this code is
3409              * executed */
3410             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3411                                      length sequence we are looking for is 2 */
3412             {
3413                 int count = 0;  /* How many characters in a multi-char fold */
3414                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3415                 if (! len) {    /* Not a multi-char fold: get next char */
3416                     s += UTF8SKIP(s);
3417                     continue;
3418                 }
3419
3420                 /* Nodes with 'ss' require special handling, except for
3421                  * EXACTFA-ish for which there is no multi-char fold to this */
3422                 if (len == 2 && *s == 's' && *(s+1) == 's'
3423                     && OP(scan) != EXACTFA
3424                     && OP(scan) != EXACTFA_NO_TRIE)
3425                 {
3426                     count = 2;
3427                     if (OP(scan) != EXACTFL) {
3428                         OP(scan) = EXACTFU_SS;
3429                     }
3430                     s += 2;
3431                 }
3432                 else { /* Here is a generic multi-char fold. */
3433                     U8* multi_end  = s + len;
3434
3435                     /* Count how many characters are in it.  In the case of
3436                      * /aa, no folds which contain ASCII code points are
3437                      * allowed, so check for those, and skip if found. */
3438                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3439                         count = utf8_length(s, multi_end);
3440                         s = multi_end;
3441                     }
3442                     else {
3443                         while (s < multi_end) {
3444                             if (isASCII(*s)) {
3445                                 s++;
3446                                 goto next_iteration;
3447                             }
3448                             else {
3449                                 s += UTF8SKIP(s);
3450                             }
3451                             count++;
3452                         }
3453                     }
3454                 }
3455
3456                 /* The delta is how long the sequence is minus 1 (1 is how long
3457                  * the character that folds to the sequence is) */
3458                 total_count_delta += count - 1;
3459               next_iteration: ;
3460             }
3461
3462             /* We created a temporary folded copy of the string in EXACTFL
3463              * nodes.  Therefore we need to be sure it doesn't go below zero,
3464              * as the real string could be shorter */
3465             if (OP(scan) == EXACTFL) {
3466                 int total_chars = utf8_length((U8*) STRING(scan),
3467                                            (U8*) STRING(scan) + STR_LEN(scan));
3468                 if (total_count_delta > total_chars) {
3469                     total_count_delta = total_chars;
3470                 }
3471             }
3472
3473             *min_subtract += total_count_delta;
3474             Safefree(folded);
3475         }
3476         else if (OP(scan) == EXACTFA) {
3477
3478             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3479              * fold to the ASCII range (and there are no existing ones in the
3480              * upper latin1 range).  But, as outlined in the comments preceding
3481              * this function, we need to flag any occurrences of the sharp s.
3482              * This character forbids trie formation (because of added
3483              * complexity) */
3484             while (s < s_end) {
3485                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3486                     OP(scan) = EXACTFA_NO_TRIE;
3487                     *unfolded_multi_char = TRUE;
3488                     break;
3489                 }
3490                 s++;
3491                 continue;
3492             }
3493         }
3494         else {
3495
3496             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3497              * folds that are all Latin1.  As explained in the comments
3498              * preceding this function, we look also for the sharp s in EXACTF
3499              * and EXACTFL nodes; it can be in the final position.  Otherwise
3500              * we can stop looking 1 byte earlier because we have to find at
3501              * least two characters for a multi-fold */
3502             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3503                               ? s_end
3504                               : s_end -1;
3505
3506             while (s < upper) {
3507                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3508                 if (! len) {    /* Not a multi-char fold. */
3509                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3510                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3511                     {
3512                         *unfolded_multi_char = TRUE;
3513                     }
3514                     s++;
3515                     continue;
3516                 }
3517
3518                 if (len == 2
3519                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3520                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3521                 {
3522
3523                     /* EXACTF nodes need to know that the minimum length
3524                      * changed so that a sharp s in the string can match this
3525                      * ss in the pattern, but they remain EXACTF nodes, as they
3526                      * won't match this unless the target string is UTF-8,
3527                      * which we don't know until runtime.  EXACTFL nodes can't
3528                      * transform into EXACTFU nodes */
3529                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3530                         OP(scan) = EXACTFU_SS;
3531                     }
3532                 }
3533
3534                 *min_subtract += len - 1;
3535                 s += len;
3536             }
3537         }
3538     }
3539
3540 #ifdef DEBUGGING
3541     /* Allow dumping but overwriting the collection of skipped
3542      * ops and/or strings with fake optimized ops */
3543     n = scan + NODE_SZ_STR(scan);
3544     while (n <= stop) {
3545         OP(n) = OPTIMIZED;
3546         FLAGS(n) = 0;
3547         NEXT_OFF(n) = 0;
3548         n++;
3549     }
3550 #endif
3551     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3552     return stopnow;
3553 }
3554
3555 /* REx optimizer.  Converts nodes into quicker variants "in place".
3556    Finds fixed substrings.  */
3557
3558 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3559    to the position after last scanned or to NULL. */
3560
3561 #define INIT_AND_WITHP \
3562     assert(!and_withp); \
3563     Newx(and_withp,1, regnode_ssc); \
3564     SAVEFREEPV(and_withp)
3565
3566 /* A chain of data about the sub-patterns we are processing that
3567    need to be handled separately/specially in study_chunk.  It lets
3568    us simulate recursion without losing state.  */
3569 struct scan_frame;
3570 typedef struct scan_frame {
3571     regnode *last;  /* last node to process in this frame */
3572     regnode *next;  /* next node to process when last is reached */
3573     struct scan_frame *prev; /* previous frame in the chain */
3574     U32 prev_recursed_depth; /* presumably the recursed_depth to restore when leaving this frame -- TODO confirm */
3575     I32 stop; /* which stopparen to use */
3576 } scan_frame;
3577
3578
3579 STATIC SSize_t
3580 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3581                         SSize_t *minlenp, SSize_t *deltap,
3582                         regnode *last,
3583                         scan_data_t *data,
3584                         I32 stopparen,
3585                         U32 recursed_depth,
3586                         regnode_ssc *and_withp,
3587                         U32 flags, U32 depth)
3588                         /* scanp: Start here (read-write). */
3589                         /* deltap: Write maxlen-minlen here. */
3590                         /* last: Stop before this one. */
3591                         /* data: string data about the pattern */
3592                         /* stopparen: treat close N as END */
3593                         /* recursed: which subroutines have we recursed into */
3594                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3595 {
3596     /* There must be at least this number of characters to match */
3597     SSize_t min = 0;
3598     I32 pars = 0, code;
3599     regnode *scan = *scanp, *next;
3600     SSize_t delta = 0;
3601     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3602     int is_inf_internal = 0;            /* The studied chunk is infinite */
3603     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3604     scan_data_t data_fake;
3605     SV *re_trie_maxbuff = NULL;
3606     regnode *first_non_open = scan;
3607     SSize_t stopmin = SSize_t_MAX;
3608     scan_frame *frame = NULL;
3609     GET_RE_DEBUG_FLAGS_DECL;
3610
3611     PERL_ARGS_ASSERT_STUDY_CHUNK;
3612
3613 #ifdef DEBUGGING
3614     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3615 #endif
3616     if ( depth == 0 ) {
3617         while (first_non_open && OP(first_non_open) == OPEN)
3618             first_non_open=regnext(first_non_open);
3619     }
3620
3621
3622   fake_study_recurse:
3623     while ( scan && OP(scan) != END && scan < last ){
3624         UV min_subtract = 0;    /* How many chars to subtract from the minimum
3625                                    node length to get a real minimum (because
3626                                    the folded version may be shorter) */
3627         bool unfolded_multi_char = FALSE;
3628         /* Peephole optimizer: */
3629         DEBUG_OPTIMISE_MORE_r(
3630         {
3631             PerlIO_printf(Perl_debug_log,
3632                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3633                 ((int) depth*2), "", (long)stopparen,
3634                 (unsigned long)depth, (unsigned long)recursed_depth);
3635             if (recursed_depth) {
3636                 U32 i;
3637                 U32 j;
3638                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3639                     PerlIO_printf(Perl_debug_log,"[");
3640                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3641                         PerlIO_printf(Perl_debug_log,"%d",
3642                             PAREN_TEST(RExC_study_chunk_recursed +
3643                                        (j * RExC_study_chunk_recursed_bytes), i)
3644                             ? 1 : 0
3645                         );
3646                     PerlIO_printf(Perl_debug_log,"]");
3647                 }
3648             }
3649             PerlIO_printf(Perl_debug_log,"\n");
3650         }
3651         );
3652         DEBUG_STUDYDATA("Peep:", data, depth);
3653         DEBUG_PEEP("Peep", scan, depth);
3654
3655
3656         /* We do this here because we need to deal with things like /(?:f)(?:o)(?:o)/
3657          * which can't be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3658          * by a different invocation of reg() -- Yves
3659          */
3660         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3661
3662         /* Follow the next-chain of the current node and optimize
3663            away all the NOTHINGs from it.  */
3664         if (OP(scan) != CURLYX) {
3665             const int max = (reg_off_by_arg[OP(scan)]
3666                        ? I32_MAX
3667                        /* I32 may be smaller than U16 on CRAYs! */
3668                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3669             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3670             int noff;
3671             regnode *n = scan;
3672
3673             /* Skip NOTHING and LONGJMP. */
3674             while ((n = regnext(n))
3675                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3676                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3677                    && off + noff < max)
3678                 off += noff;
3679             if (reg_off_by_arg[OP(scan)])
3680                 ARG(scan) = off;
3681             else
3682                 NEXT_OFF(scan) = off;
3683         }
3684
3685
3686
3687         /* The principal pseudo-switch.  Cannot be a switch, since we
3688            look into several different things.  */
3689         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3690                    || OP(scan) == IFTHEN) {
3691             next = regnext(scan);
3692             code = OP(scan);
3693             /* demq: the op(next)==code check is to see if we have
3694              * "branch-branch" AFAICT */
3695
3696             if (OP(next) == code || code == IFTHEN) {
3697                 /* NOTE - There is similar code to this block below for
3698                  * handling TRIE nodes on a re-study.  If you change stuff here
3699                  * check there too. */
3700                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3701                 regnode_ssc accum;
3702                 regnode * const startbranch=scan;
3703
3704                 if (flags & SCF_DO_SUBSTR) {
3705                     /* Cannot merge strings after this. */
3706                     scan_commit(pRExC_state, data, minlenp, is_inf);
3707                 }
3708
3709                 if (flags & SCF_DO_STCLASS)
3710                     ssc_init_zero(pRExC_state, &accum);
3711
3712                 while (OP(scan) == code) {
3713                     SSize_t deltanext, minnext, fake;
3714                     I32 f = 0;
3715                     regnode_ssc this_class;
3716
3717                     num++;
3718                     data_fake.flags = 0;
3719                     if (data) {
3720                         data_fake.whilem_c = data->whilem_c;
3721                         data_fake.last_closep = data->last_closep;
3722                     }
3723                     else
3724                         data_fake.last_closep = &fake;
3725
3726                     data_fake.pos_delta = delta;
3727                     next = regnext(scan);
3728                     scan = NEXTOPER(scan);
3729                     if (code != BRANCH)
3730                         scan = NEXTOPER(scan);
3731                     if (flags & SCF_DO_STCLASS) {
3732                         ssc_init(pRExC_state, &this_class);
3733                         data_fake.start_class = &this_class;
3734                         f = SCF_DO_STCLASS_AND;
3735                     }
3736                     if (flags & SCF_WHILEM_VISITED_POS)
3737                         f |= SCF_WHILEM_VISITED_POS;
3738
3739                     /* we suppose the run is continuous, last=next...*/
3740                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3741                                       &deltanext, next, &data_fake, stopparen,
3742                                       recursed_depth, NULL, f,depth+1);
3743                     if (min1 > minnext)
3744                         min1 = minnext;
3745                     if (deltanext == SSize_t_MAX) {
3746                         is_inf = is_inf_internal = 1;
3747                         max1 = SSize_t_MAX;
3748                     } else if (max1 < minnext + deltanext)
3749                         max1 = minnext + deltanext;
3750                     scan = next;
3751                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3752                         pars++;
3753                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3754                         if ( stopmin > minnext)
3755                             stopmin = min + min1;
3756                         flags &= ~SCF_DO_SUBSTR;
3757                         if (data)
3758                             data->flags |= SCF_SEEN_ACCEPT;
3759                     }
3760                     if (data) {
3761                         if (data_fake.flags & SF_HAS_EVAL)
3762                             data->flags |= SF_HAS_EVAL;
3763                         data->whilem_c = data_fake.whilem_c;
3764                     }
3765                     if (flags & SCF_DO_STCLASS)
3766                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3767                 }
3768                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3769                     min1 = 0;
3770                 if (flags & SCF_DO_SUBSTR) {
3771                     data->pos_min += min1;
3772                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3773                         data->pos_delta = SSize_t_MAX;
3774                     else
3775                         data->pos_delta += max1 - min1;
3776                     if (max1 != min1 || is_inf)
3777                         data->longest = &(data->longest_float);
3778                 }
3779                 min += min1;
3780                 if (delta == SSize_t_MAX
3781                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3782                     delta = SSize_t_MAX;
3783                 else
3784                     delta += max1 - min1;
3785                 if (flags & SCF_DO_STCLASS_OR) {
3786                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3787                     if (min1) {
3788                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3789                         flags &= ~SCF_DO_STCLASS;
3790                     }
3791                 }
3792                 else if (flags & SCF_DO_STCLASS_AND) {
3793                     if (min1) {
3794                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3795                         flags &= ~SCF_DO_STCLASS;
3796                     }
3797                     else {
3798                         /* Switch to OR mode: cache the old value of
3799                          * data->start_class */
3800                         INIT_AND_WITHP;
3801                         StructCopy(data->start_class, and_withp, regnode_ssc);
3802                         flags &= ~SCF_DO_STCLASS_AND;
3803                         StructCopy(&accum, data->start_class, regnode_ssc);
3804                         flags |= SCF_DO_STCLASS_OR;
3805                     }
3806                 }
3807
3808                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3809                         OP( startbranch ) == BRANCH )
3810                 {
3811                 /* demq.
3812
3813                    Assuming this was/is a branch we are dealing with: 'scan'
3814                    now points at the item that follows the branch sequence,
3815                    whatever it is. We now start at the beginning of the
3816                    sequence and look for subsequences of
3817
3818                    BRANCH->EXACT=>x1
3819                    BRANCH->EXACT=>x2
3820                    tail
3821
3822                    which would be constructed from a pattern like
3823                    /A|LIST|OF|WORDS/
3824
3825                    If we can find such a subsequence we need to turn the first
3826                    element into a trie and then add the subsequent branch exact
3827                    strings to the trie.
3828
3829                    We have two cases
3830
3831                      1. patterns where the whole set of branches can be
3832                         converted.
3833
3834                      2. patterns where only a subset can be converted.
3835
3836                    In case 1 we can replace the whole set with a single regop
3837                    for the trie. In case 2 we need to keep the start and end
3838                    branches so
3839
3840                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3841                      becomes BRANCH TRIE; BRANCH X;
3842
3843                   There is an additional case, that being where there is a
3844                   common prefix, which gets split out into an EXACT like node
3845                   preceding the TRIE node.
3846
3847                   If x(1..n)==tail then we can do a simple trie, if not we make
3848                   a "jump" trie, such that when we match the appropriate word
3849                   we "jump" to the appropriate tail node. Essentially we turn
3850                   a nested if into a case structure of sorts.
3851
3852                 */
3853
3854                     int made=0;
3855                     if (!re_trie_maxbuff) {
3856                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3857                         if (!SvIOK(re_trie_maxbuff))
3858                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3859                     }
3860                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3861                         regnode *cur;
3862                         regnode *first = (regnode *)NULL;
3863                         regnode *last = (regnode *)NULL;
3864                         regnode *tail = scan;
3865                         U8 trietype = 0;
3866                         U32 count=0;
3867
3868 #ifdef DEBUGGING
3869                         SV * const mysv = sv_newmortal();   /* for dumping */
3870 #endif
3871                         /* var tail is used because there may be a TAIL
3872                            regop in the way. Ie, the exacts will point to the
3873                            thing following the TAIL, but the last branch will
3874                            point at the TAIL. So we advance tail. If we
3875                            have nested (?:) we may have to move through several
3876                            tails.
3877                          */
3878
3879                         while ( OP( tail ) == TAIL ) {
3880                             /* this is the TAIL generated by (?:) */
3881                             tail = regnext( tail );
3882                         }
3883
3884
3885                         DEBUG_TRIE_COMPILE_r({
3886                             regprop(RExC_rx, mysv, tail, NULL);
3887                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3888                               (int)depth * 2 + 2, "",
3889                               "Looking for TRIE'able sequences. Tail node is: ",
3890                               SvPV_nolen_const( mysv )
3891                             );
3892                         });
3893
3894                         /*
3895
3896                             Step through the branches
3897                                 cur represents each branch,
3898                                 noper is the first thing to be matched as part
3899                                       of that branch
3900                                 noper_next is the regnext() of that node.
3901
3902                             We normally handle a case like this
3903                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3904                             support building with NOJUMPTRIE, which restricts
3905                             the trie logic to structures like /FOO|BAR/.
3906
3907                             If noper is a trieable nodetype then the branch is
3908                             a possible optimization target. If we are building
3909                             under NOJUMPTRIE then we require that noper_next is
3910                             the same as scan (our current position in the regex
3911                             program).
3912
3913                             Once we have two or more consecutive such branches
3914                             we can create a trie of the EXACT's contents and
3915                             stitch it in place into the program.
3916
3917                             If the sequence represents all of the branches in
3918                             the alternation we replace the entire thing with a
3919                             single TRIE node.
3920
3921                             Otherwise when it is a subsequence we need to
3922                             stitch it in place and replace only the relevant
3923                             branches. This means the first branch has to remain
3924                             as it is used by the alternation logic, and its
3925                             next pointer, and needs to be repointed at the item
3926                             on the branch chain following the last branch we
3927                             have optimized away.
3928
3929                             This could be either a BRANCH, in which case the
3930                             subsequence is internal, or it could be the item
3931                             following the branch sequence in which case the
3932                             subsequence is at the end (which does not
3933                             necessarily mean the first node is the start of the
3934                             alternation).
3935
3936                             TRIE_TYPE(X) is a define which maps the optype to a
3937                             trietype.
3938
3939                                 optype          |  trietype
3940                                 ----------------+-----------
3941                                 NOTHING         | NOTHING
3942                                 EXACT           | EXACT
3943                                 EXACTFU         | EXACTFU
3944                                 EXACTFU_SS      | EXACTFU
3945                                 EXACTFA         | EXACTFA
3946
3947
3948                         */
3949 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3950                        ( EXACT == (X) )   ? EXACT :        \
3951                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3952                        ( EXACTFA == (X) ) ? EXACTFA :        \
3953                        0 )
3954
3955                         /* don't use tail as the end marker for this traversal */
3956                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3957                             regnode * const noper = NEXTOPER( cur );
3958                             U8 noper_type = OP( noper );
3959                             U8 noper_trietype = TRIE_TYPE( noper_type );
3960 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3961                             regnode * const noper_next = regnext( noper );
3962                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3963                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3964 #endif
3965
3966                             DEBUG_TRIE_COMPILE_r({
3967                                 regprop(RExC_rx, mysv, cur, NULL);
3968                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3969                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3970
3971                                 regprop(RExC_rx, mysv, noper, NULL);
3972                                 PerlIO_printf( Perl_debug_log, " -> %s",
3973                                     SvPV_nolen_const(mysv));
3974
3975                                 if ( noper_next ) {
3976                                   regprop(RExC_rx, mysv, noper_next, NULL);
3977                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3978                                     SvPV_nolen_const(mysv));
3979                                 }
3980                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3981                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3982                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3983                                 );
3984                             });
3985
3986                             /* Is noper a trieable nodetype that can be merged
3987                              * with the current trie (if there is one)? */
3988                             if ( noper_trietype
3989                                   &&
3990                                   (
3991                                         ( noper_trietype == NOTHING)
3992                                         || ( trietype == NOTHING )
3993                                         || ( trietype == noper_trietype )
3994                                   )
3995 #ifdef NOJUMPTRIE
3996                                   && noper_next == tail
3997 #endif
3998                                   && count < U16_MAX)
3999                             {
4000                                 /* Handle mergeable trieable node. Either we are
4001                                  * the first node in a new trieable sequence,
4002                                  * in which case we do some bookkeeping,
4003                                  * otherwise we update the end pointer. */
4004                                 if ( !first ) {
4005                                     first = cur;
4006                                     if ( noper_trietype == NOTHING ) {
4007 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4008                                         regnode * const noper_next = regnext( noper );
4009                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4010                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4011 #endif
4012
4013                                         if ( noper_next_trietype ) {
4014                                             trietype = noper_next_trietype;
4015                                         } else if (noper_next_type)  {
4016                                             /* a NOTHING regop is 1 regop wide.
4017                                              * We need at least two for a trie
4018                                              * so we can't merge this in */
4019                                             first = NULL;
4020                                         }
4021                                     } else {
4022                                         trietype = noper_trietype;
4023                                     }
4024                                 } else {
4025                                     if ( trietype == NOTHING )
4026                                         trietype = noper_trietype;
4027                                     last = cur;
4028                                 }
4029                                 if (first)
4030                                     count++;
4031                             } /* end handle mergable triable node */
4032                             else {
4033                                 /* handle unmergable node -
4034                                  * noper may either be a triable node which can
4035                                  * not be tried together with the current trie,
4036                                  * or a non triable node */
4037                                 if ( last ) {
4038                                     /* If last is set and trietype is not
4039                                      * NOTHING then we have found at least two
4040                                      * triable branch sequences in a row of a
4041                                      * similar trietype so we can turn them
4042                                      * into a trie. If/when we allow NOTHING to
4043                                      * start a trie sequence this condition
4044                                      * will be required, and it isn't expensive
4045                                      * so we leave it in for now. */
4046                                     if ( trietype && trietype != NOTHING )
4047                                         make_trie( pRExC_state,
4048                                                 startbranch, first, cur, tail,
4049                                                 count, trietype, depth+1 );
4050                                     last = NULL; /* note: we clear/update
4051                                                     first, trietype etc below,
4052                                                     so we don't do it here */
4053                                 }
4054                                 if ( noper_trietype
4055 #ifdef NOJUMPTRIE
4056                                      && noper_next == tail
4057 #endif
4058                                 ){
4059                                     /* noper is triable, so we can start a new
4060                                      * trie sequence */
4061                                     count = 1;
4062                                     first = cur;
4063                                     trietype = noper_trietype;
4064                                 } else if (first) {
4065                                     /* if we already saw a first but the
4066                                      * current node is not triable then we have
4067                                      * to reset the first information. */
4068                                     count = 0;
4069                                     first = NULL;
4070                                     trietype = 0;
4071                                 }
4072                             } /* end handle unmergable node */
4073                         } /* loop over branches */
4074                         DEBUG_TRIE_COMPILE_r({
4075                             regprop(RExC_rx, mysv, cur, NULL);
4076                             PerlIO_printf( Perl_debug_log,
4077                               "%*s- %s (%d) <SCAN FINISHED>\n",
4078                               (int)depth * 2 + 2,
4079                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4080
4081                         });
4082                         if ( last && trietype ) {
4083                             if ( trietype != NOTHING ) {
4084                                 /* the last branch of the sequence was part of
4085                                  * a trie, so we have to construct it here
4086                                  * outside of the loop */
4087                                 made= make_trie( pRExC_state, startbranch,
4088                                                  first, scan, tail, count,
4089                                                  trietype, depth+1 );
4090 #ifdef TRIE_STUDY_OPT
4091                                 if ( ((made == MADE_EXACT_TRIE &&
4092                                      startbranch == first)
4093                                      || ( first_non_open == first )) &&
4094                                      depth==0 ) {
4095                                     flags |= SCF_TRIE_RESTUDY;
4096                                     if ( startbranch == first
4097                                          && scan == tail )
4098                                     {
4099                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4100                                     }
4101                                 }
4102 #endif
4103                             } else {
4104                                 /* at this point we know whatever we have is a
4105                                  * NOTHING sequence/branch AND if 'startbranch'
4106                                  * is 'first' then we can turn the whole thing
4107                                  * into a NOTHING
4108                                  */
4109                                 if ( startbranch == first ) {
4110                                     regnode *opt;
4111                                     /* the entire thing is a NOTHING sequence,
4112                                      * something like this: (?:|) So we can
4113                                      * turn it into a plain NOTHING op. */
4114                                     DEBUG_TRIE_COMPILE_r({
4115                                         regprop(RExC_rx, mysv, cur, NULL);
4116                                         PerlIO_printf( Perl_debug_log,
4117                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4118                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4119
4120                                     });
4121                                     OP(startbranch)= NOTHING;
4122                                     NEXT_OFF(startbranch)= tail - startbranch;
4123                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4124                                         OP(opt)= OPTIMIZED;
4125                                 }
4126                             }
4127                         } /* end if ( last) */
4128                     } /* TRIE_MAXBUF is non zero */
4129
4130                 } /* do trie */
4131
4132             }
4133             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4134                 scan = NEXTOPER(NEXTOPER(scan));
4135             } else                      /* single branch is optimized. */
4136                 scan = NEXTOPER(scan);
4137             continue;
4138         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4139             scan_frame *newframe = NULL;
4140             I32 paren;
4141             regnode *start;
4142             regnode *end;
4143             U32 my_recursed_depth= recursed_depth;
4144
4145             if (OP(scan) != SUSPEND) {
4146                 /* set the pointer */
4147                 if (OP(scan) == GOSUB) {
4148                     paren = ARG(scan);
4149                     RExC_recurse[ARG2L(scan)] = scan;
4150                     start = RExC_open_parens[paren-1];
4151                     end   = RExC_close_parens[paren-1];
4152                 } else {
4153                     paren = 0;
4154                     start = RExC_rxi->program + 1;
4155                     end   = RExC_opend;
4156                 }
4157                 if (!recursed_depth
4158                     ||
4159                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4160                 ) {
4161                     if (!recursed_depth) {
4162                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4163                     } else {
4164                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4165                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4166                              RExC_study_chunk_recursed_bytes, U8);
4167                     }
4168                     /* we haven't recursed into this paren yet, so recurse into it */
4169                     DEBUG_STUDYDATA("set:", data,depth);
4170                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4171                     my_recursed_depth= recursed_depth + 1;
4172                     Newx(newframe,1,scan_frame);
4173                 } else {
4174                     DEBUG_STUDYDATA("inf:", data,depth);
4175                     /* some form of infinite recursion, assume infinite length
4176                      * */
4177                     if (flags & SCF_DO_SUBSTR) {
4178                         scan_commit(pRExC_state, data, minlenp, is_inf);
4179                         data->longest = &(data->longest_float);
4180                     }
4181                     is_inf = is_inf_internal = 1;
4182                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4183                         ssc_anything(data->start_class);
4184                     flags &= ~SCF_DO_STCLASS;
4185                 }
4186             } else {
4187                 Newx(newframe,1,scan_frame);
4188                 paren = stopparen;
4189                 start = scan+2;
4190                 end = regnext(scan);
4191             }
4192             if (newframe) {
4193                 assert(start);
4194                 assert(end);
4195                 SAVEFREEPV(newframe);
4196                 newframe->next = regnext(scan);
4197                 newframe->last = last;
4198                 newframe->stop = stopparen;
4199                 newframe->prev = frame;
4200                 newframe->prev_recursed_depth = recursed_depth;
4201
4202                 DEBUG_STUDYDATA("frame-new:",data,depth);
4203                 DEBUG_PEEP("fnew", scan, depth);
4204
4205                 frame = newframe;
4206                 scan =  start;
4207                 stopparen = paren;
4208                 last = end;
4209                 depth = depth + 1;
4210                 recursed_depth= my_recursed_depth;
4211
4212                 continue;
4213             }
4214         }
4215         else if (OP(scan) == EXACT) {
4216             SSize_t l = STR_LEN(scan);
4217             UV uc;
4218             if (UTF) {
4219                 const U8 * const s = (U8*)STRING(scan);
4220                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4221                 l = utf8_length(s, s + l);
4222             } else {
4223                 uc = *((U8*)STRING(scan));
4224             }
4225             min += l;
4226             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4227                 /* The code below prefers earlier match for fixed
4228                    offset, later match for variable offset.  */
4229                 if (data->last_end == -1) { /* Update the start info. */
4230                     data->last_start_min = data->pos_min;
4231                     data->last_start_max = is_inf
4232                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4233                 }
4234                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4235                 if (UTF)
4236                     SvUTF8_on(data->last_found);
4237                 {
4238                     SV * const sv = data->last_found;
4239                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4240                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4241                     if (mg && mg->mg_len >= 0)
4242                         mg->mg_len += utf8_length((U8*)STRING(scan),
4243                                               (U8*)STRING(scan)+STR_LEN(scan));
4244                 }
4245                 data->last_end = data->pos_min + l;
4246                 data->pos_min += l; /* As in the first entry. */
4247                 data->flags &= ~SF_BEFORE_EOL;
4248             }
4249
4250             /* ANDing the code point leaves at most it, and not in locale, and
4251              * can't match null string */
4252             if (flags & SCF_DO_STCLASS_AND) {
4253                 ssc_cp_and(data->start_class, uc);
4254                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4255                 ssc_clear_locale(data->start_class);
4256             }
4257             else if (flags & SCF_DO_STCLASS_OR) {
4258                 ssc_add_cp(data->start_class, uc);
4259                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4260
4261                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4262                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4263             }
4264             flags &= ~SCF_DO_STCLASS;
4265         }
4266         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4267                                                      EXACTFish */
4268             SSize_t l = STR_LEN(scan);
4269             UV uc = *((U8*)STRING(scan));
4270             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4271                                                      separate code points */
4272             const U8 * s = (U8*)STRING(scan);
4273
4274             /* Search for fixed substrings supports EXACT only. */
4275             if (flags & SCF_DO_SUBSTR) {
4276                 assert(data);
4277                 scan_commit(pRExC_state, data, minlenp, is_inf);
4278             }
4279             if (UTF) {
4280                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4281                 l = utf8_length(s, s + l);
4282             }
4283             if (unfolded_multi_char) {
4284                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4285             }
4286             min += l - min_subtract;
4287             assert (min >= 0);
4288             delta += min_subtract;
4289             if (flags & SCF_DO_SUBSTR) {
4290                 data->pos_min += l - min_subtract;
4291                 if (data->pos_min < 0) {
4292                     data->pos_min = 0;
4293                 }
4294                 data->pos_delta += min_subtract;
4295                 if (min_subtract) {
4296                     data->longest = &(data->longest_float);
4297                 }
4298             }
4299
4300             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4301                 ssc_clear_locale(data->start_class);
4302             }
4303
4304             if (! UTF) {
4305
4306                 /* We punt and assume can match anything if the node begins
4307                  * with a multi-character fold.  Things are complicated.  For
4308                  * example, /ffi/i could match any of:
4309                  *  "\N{LATIN SMALL LIGATURE FFI}"
4310                  *  "\N{LATIN SMALL LIGATURE FF}I"
4311                  *  "F\N{LATIN SMALL LIGATURE FI}"
4312                  *  plus several other things; and making sure we have all the
4313                  *  possibilities is hard. */
4314                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4315                     EXACTF_invlist =
4316                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4317                 }
4318                 else {
4319
4320                     /* Any Latin1 range character can potentially match any
4321                      * other depending on the locale */
4322                     if (OP(scan) == EXACTFL) {
4323                         _invlist_union(EXACTF_invlist, PL_Latin1,
4324                                                               &EXACTF_invlist);
4325                     }
4326                     else {
4327                         /* But otherwise, it matches at least itself.  We can
4328                          * quickly tell if it has a distinct fold, and if so,
4329                          * it matches that as well */
4330                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4331                         if (IS_IN_SOME_FOLD_L1(uc)) {
4332                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4333                                                            PL_fold_latin1[uc]);
4334                         }
4335                     }
4336
4337                     /* Some characters match above-Latin1 ones under /i.  This
4338                      * is true of EXACTFL ones when the locale is UTF-8 */
4339                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4340                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4341                                             && OP(scan) != EXACTFA_NO_TRIE)))
4342                     {
4343                         add_above_Latin1_folds(pRExC_state,
4344                                                (U8) uc,
4345                                                &EXACTF_invlist);
4346                     }
4347                 }
4348             }
4349             else {  /* Pattern is UTF-8 */
4350                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4351                 STRLEN foldlen = UTF8SKIP(s);
4352                 const U8* e = s + STR_LEN(scan);
4353                 SV** listp;
4354
4355                 /* The only code points that aren't folded in a UTF EXACTFish
4356                  * node are the problematic ones in EXACTFL nodes */
4357                 if (OP(scan) == EXACTFL
4358                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4359                 {
4360                     /* We need to check for the possibility that this EXACTFL
4361                      * node begins with a multi-char fold.  Therefore we fold
4362                      * the first few characters of it so that we can make that
4363                      * check */
4364                     U8 *d = folded;
4365                     int i;
4366
4367                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4368                         if (isASCII(*s)) {
4369                             *(d++) = (U8) toFOLD(*s);
4370                             s++;
4371                         }
4372                         else {
4373                             STRLEN len;
4374                             to_utf8_fold(s, d, &len);
4375                             d += len;
4376                             s += UTF8SKIP(s);
4377                         }
4378                     }
4379
4380                     /* And set up so that the code below looks in this folded
4381                      * buffer instead of the node's string */
4382                     e = d;
4383                     foldlen = UTF8SKIP(folded);
4384                     s = folded;
4385                 }
4386
4387                 /* When we reach here 's' points to the fold of the first
4388                  * character(s) of the node; and 'e' points to far enough along
4389                  * the folded string to be just past any possible multi-char
4390                  * fold. 'foldlen' is the length in bytes of the first
4391                  * character in 's'
4392                  *
4393                  * Unlike the non-UTF-8 case, the macro for determining if a
4394                  * string is a multi-char fold requires all the characters to
4395                  * already be folded.  This is because of all the complications
4396                  * if not.  Note that they are folded anyway, except in EXACTFL
4397                  * nodes.  Like the non-UTF case above, we punt if the node
4398                  * begins with a multi-char fold  */
4399
4400                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4401                     EXACTF_invlist =
4402                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4403                 }
4404                 else {  /* Single char fold */
4405
4406                     /* It matches all the things that fold to it, which are
4407                      * found in PL_utf8_foldclosures (including itself) */
4408                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4409                     if (! PL_utf8_foldclosures) {
4410                         _load_PL_utf8_foldclosures();
4411                     }
4412                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4413                                         (char *) s, foldlen, FALSE)))
4414                     {
4415                         AV* list = (AV*) *listp;
4416                         IV k;
4417                         for (k = 0; k <= av_tindex(list); k++) {
4418                             SV** c_p = av_fetch(list, k, FALSE);
4419                             UV c;
4420                             assert(c_p);
4421
4422                             c = SvUV(*c_p);
4423
4424                             /* /aa doesn't allow folds between ASCII and non- */
4425                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4426                                 && isASCII(c) != isASCII(uc))
4427                             {
4428                                 continue;
4429                             }
4430
4431                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4432                         }
4433                     }
4434                 }
4435             }
4436             if (flags & SCF_DO_STCLASS_AND) {
4437                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4438                 ANYOF_POSIXL_ZERO(data->start_class);
4439                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4440             }
4441             else if (flags & SCF_DO_STCLASS_OR) {
4442                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4443                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4444
4445                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4446                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4447             }
4448             flags &= ~SCF_DO_STCLASS;
4449             SvREFCNT_dec(EXACTF_invlist);
4450         }
4451         else if (REGNODE_VARIES(OP(scan))) {
4452             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4453             I32 fl = 0, f = flags;
4454             regnode * const oscan = scan;
4455             regnode_ssc this_class;
4456             regnode_ssc *oclass = NULL;
4457             I32 next_is_eval = 0;
4458
4459             switch (PL_regkind[OP(scan)]) {
4460             case WHILEM:                /* End of (?:...)* . */
4461                 scan = NEXTOPER(scan);
4462                 goto finish;
4463             case PLUS:
4464                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4465                     next = NEXTOPER(scan);
4466                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4467                         mincount = 1;
4468                         maxcount = REG_INFTY;
4469                         next = regnext(scan);
4470                         scan = NEXTOPER(scan);
4471                         goto do_curly;
4472                     }
4473                 }
4474                 if (flags & SCF_DO_SUBSTR)
4475                     data->pos_min++;
4476                 min++;
4477                 /* FALLTHROUGH */
4478             case STAR:
4479                 if (flags & SCF_DO_STCLASS) {
4480                     mincount = 0;
4481                     maxcount = REG_INFTY;
4482                     next = regnext(scan);
4483                     scan = NEXTOPER(scan);
4484                     goto do_curly;
4485                 }
4486                 if (flags & SCF_DO_SUBSTR) {
4487                     scan_commit(pRExC_state, data, minlenp, is_inf);
4488                     /* Cannot extend fixed substrings */
4489                     data->longest = &(data->longest_float);
4490                 }
4491                 is_inf = is_inf_internal = 1;
4492                 scan = regnext(scan);
4493                 goto optimize_curly_tail;
4494             case CURLY:
4495                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4496                     && (scan->flags == stopparen))
4497                 {
4498                     mincount = 1;
4499                     maxcount = 1;
4500                 } else {
4501                     mincount = ARG1(scan);
4502                     maxcount = ARG2(scan);
4503                 }
4504                 next = regnext(scan);
4505                 if (OP(scan) == CURLYX) {
4506                     I32 lp = (data ? *(data->last_closep) : 0);
4507                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4508                 }
4509                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4510                 next_is_eval = (OP(scan) == EVAL);
4511               do_curly:
4512                 if (flags & SCF_DO_SUBSTR) {
4513                     if (mincount == 0)
4514                         scan_commit(pRExC_state, data, minlenp, is_inf);
4515                     /* Cannot extend fixed substrings */
4516                     pos_before = data->pos_min;
4517                 }
4518                 if (data) {
4519                     fl = data->flags;
4520                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4521                     if (is_inf)
4522                         data->flags |= SF_IS_INF;
4523                 }
4524                 if (flags & SCF_DO_STCLASS) {
4525                     ssc_init(pRExC_state, &this_class);
4526                     oclass = data->start_class;
4527                     data->start_class = &this_class;
4528                     f |= SCF_DO_STCLASS_AND;
4529                     f &= ~SCF_DO_STCLASS_OR;
4530                 }
4531                 /* Exclude from super-linear cache processing any {n,m}
4532                    regops for which the combination of input pos and regex
4533                    pos is not enough information to determine if a match
4534                    will be possible.
4535
4536                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4537                    regex pos at the \s*, the prospects for a match depend not
4538                    only on the input position but also on how many (bar\s*)
4539                    repeats into the {4,8} we are. */
4540                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4541                     f &= ~SCF_WHILEM_VISITED_POS;
4542
4543                 /* This will finish on WHILEM, setting scan, or on NULL: */
4544                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4545                                   last, data, stopparen, recursed_depth, NULL,
4546                                   (mincount == 0
4547                                    ? (f & ~SCF_DO_SUBSTR)
4548                                    : f)
4549                                   ,depth+1);
4550
4551                 if (flags & SCF_DO_STCLASS)
4552                     data->start_class = oclass;
4553                 if (mincount == 0 || minnext == 0) {
4554                     if (flags & SCF_DO_STCLASS_OR) {
4555                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4556                     }
4557                     else if (flags & SCF_DO_STCLASS_AND) {
4558                         /* Switch to OR mode: cache the old value of
4559                          * data->start_class */
4560                         INIT_AND_WITHP;
4561                         StructCopy(data->start_class, and_withp, regnode_ssc);
4562                         flags &= ~SCF_DO_STCLASS_AND;
4563                         StructCopy(&this_class, data->start_class, regnode_ssc);
4564                         flags |= SCF_DO_STCLASS_OR;
4565                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4566                     }
4567                 } else {                /* Non-zero len */
4568                     if (flags & SCF_DO_STCLASS_OR) {
4569                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4570                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4571                     }
4572                     else if (flags & SCF_DO_STCLASS_AND)
4573                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4574                     flags &= ~SCF_DO_STCLASS;
4575                 }
4576                 if (!scan)              /* It was not CURLYX, but CURLY. */
4577                     scan = next;
4578                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4579                     /* ? quantifier ok, except for (?{ ... }) */
4580                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4581                     && (minnext == 0) && (deltanext == 0)
4582                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4583                     && maxcount <= REG_INFTY/3) /* Complement check for big
4584                                                    count */
4585                 {
4586                     /* Fatal warnings may leak the regexp without this: */
4587                     SAVEFREESV(RExC_rx_sv);
4588                     ckWARNreg(RExC_parse,
4589                             "Quantifier unexpected on zero-length expression");
4590                     (void)ReREFCNT_inc(RExC_rx_sv);
4591                 }
4592
4593                 min += minnext * mincount;
4594                 is_inf_internal |= deltanext == SSize_t_MAX
4595                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4596                 is_inf |= is_inf_internal;
4597                 if (is_inf) {
4598                     delta = SSize_t_MAX;
4599                 } else {
4600                     delta += (minnext + deltanext) * maxcount
4601                              - minnext * mincount;
4602                 }
4603                 /* Try powerful optimization CURLYX => CURLYN. */
4604                 if (  OP(oscan) == CURLYX && data
4605                       && data->flags & SF_IN_PAR
4606                       && !(data->flags & SF_HAS_EVAL)
4607                       && !deltanext && minnext == 1 ) {
4608                     /* Try to optimize to CURLYN.  */
4609                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4610                     regnode * const nxt1 = nxt;
4611 #ifdef DEBUGGING
4612                     regnode *nxt2;
4613 #endif
4614
4615                     /* Skip open. */
4616                     nxt = regnext(nxt);
4617                     if (!REGNODE_SIMPLE(OP(nxt))
4618                         && !(PL_regkind[OP(nxt)] == EXACT
4619                              && STR_LEN(nxt) == 1))
4620                         goto nogo;
4621 #ifdef DEBUGGING
4622                     nxt2 = nxt;
4623 #endif
4624                     nxt = regnext(nxt);
4625                     if (OP(nxt) != CLOSE)
4626                         goto nogo;
4627                     if (RExC_open_parens) {
4628                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYN*/
4629                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4630                     }
4631                     /* Now we know that nxt2 is the only contents: */
4632                     oscan->flags = (U8)ARG(nxt);
4633                     OP(oscan) = CURLYN;
4634                     OP(nxt1) = NOTHING; /* was OPEN. */
4635
4636 #ifdef DEBUGGING
4637                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4638                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4639                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4640                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4641                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4642                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4643 #endif
4644                 }
4645               nogo:
4646
4647                 /* Try optimization CURLYX => CURLYM. */
4648                 if (  OP(oscan) == CURLYX && data
4649                       && !(data->flags & SF_HAS_PAR)
4650                       && !(data->flags & SF_HAS_EVAL)
4651                       && !deltanext     /* atom is fixed width */
4652                       && minnext != 0   /* CURLYM can't handle zero width */
4653
4654                          /* Nor characters whose fold at run-time may be
4655                           * multi-character */
4656                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4657                 ) {
4658                     /* XXXX How to optimize if data == 0? */
4659                     /* Optimize to a simpler form.  */
4660                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4661                     regnode *nxt2;
4662
4663                     OP(oscan) = CURLYM;
4664                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4665                             && (OP(nxt2) != WHILEM))
4666                         nxt = nxt2;
4667                     OP(nxt2)  = SUCCEED; /* Was WHILEM */
4668                     /* Need to optimize away the parentheses. */
4669                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4670                         /* Set the parenth number.  */
4671                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4672
4673                         oscan->flags = (U8)ARG(nxt);
4674                         if (RExC_open_parens) {
4675                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4676                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4677                         }
4678                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4679                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4680
4681 #ifdef DEBUGGING
4682                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4683                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4684                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4685                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4686 #endif
4687 #if 0
4688                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4689                             regnode *nnxt = regnext(nxt1);
4690                             if (nnxt == nxt) {
4691                                 if (reg_off_by_arg[OP(nxt1)])
4692                                     ARG_SET(nxt1, nxt2 - nxt1);
4693                                 else if (nxt2 - nxt1 < U16_MAX)
4694                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4695                                 else
4696                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4697                             }
4698                             nxt1 = nnxt;
4699                         }
4700 #endif
4701                         /* Optimize again: */
4702                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4703                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4704                     }
4705                     else
4706                         oscan->flags = 0;
4707                 }
4708                 else if ((OP(oscan) == CURLYX)
4709                          && (flags & SCF_WHILEM_VISITED_POS)
4710                          /* See the comment on a similar expression above.
4711                             However, this time it's not a subexpression
4712                             we care about, but the expression itself. */
4713                          && (maxcount == REG_INFTY)
4714                          && data && ++data->whilem_c < 16) {
4715                     /* This stays as CURLYX; store the count/of pair in
4716                      * the flags of its WHILEM (found as in regexec.c) */
4717                     regnode *nxt = oscan + NEXT_OFF(oscan);
4718
4719                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4720                         nxt += ARG(nxt);
4721                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4722                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4723                 }
4724                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4725                     pars++;
4726                 if (flags & SCF_DO_SUBSTR) {
4727                     SV *last_str = NULL;
4728                     STRLEN last_chrs = 0;
4729                     int counted = mincount != 0;
4730
4731                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4732                                                                   string. */
4733                         SSize_t b = pos_before >= data->last_start_min
4734                             ? pos_before : data->last_start_min;
4735                         STRLEN l;
4736                         const char * const s = SvPV_const(data->last_found, l);
4737                         SSize_t old = b - data->last_start_min;
4738
4739                         if (UTF)
4740                             old = utf8_hop((U8*)s, old) - (U8*)s;
4741                         l -= old;
4742                         /* Get the added string: */
4743                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4744                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4745                                             (U8*)(s + old + l)) : l;
4746                         if (deltanext == 0 && pos_before == b) {
4747                             /* What was added is a constant string */
4748                             if (mincount > 1) {
4749
4750                                 SvGROW(last_str, (mincount * l) + 1);
4751                                 repeatcpy(SvPVX(last_str) + l,
4752                                           SvPVX_const(last_str), l,
4753                                           mincount - 1);
4754                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4755                                 /* Add additional parts. */
4756                                 SvCUR_set(data->last_found,
4757                                           SvCUR(data->last_found) - l);
4758                                 sv_catsv(data->last_found, last_str);
4759                                 {
4760                                     SV * sv = data->last_found;
4761                                     MAGIC *mg =
4762                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4763                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4764                                     if (mg && mg->mg_len >= 0)
4765                                         mg->mg_len += last_chrs * (mincount-1);
4766                                 }
4767                                 last_chrs *= mincount;
4768                                 data->last_end += l * (mincount - 1);
4769                             }
4770                         } else {
4771                             /* start offset must point into the last copy */
4772                             data->last_start_min += minnext * (mincount - 1);
4773                             data->last_start_max += is_inf ? SSize_t_MAX
4774                                 : (maxcount - 1) * (minnext + data->pos_delta);
4775                         }
4776                     }
4777                     /* It is counted once already... */
4778                     data->pos_min += minnext * (mincount - counted);
4779 #if 0
4780 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4781                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4782                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4783     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4784     (UV)mincount);
4785 if (deltanext != SSize_t_MAX)
4786 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4787     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4788           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4789 #endif
4790                     if (deltanext == SSize_t_MAX
4791                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4792                         data->pos_delta = SSize_t_MAX;
4793                     else
4794                         data->pos_delta += - counted * deltanext +
4795                         (minnext + deltanext) * maxcount - minnext * mincount;
4796                     if (mincount != maxcount) {
4797                          /* Cannot extend fixed substrings found inside
4798                             the group.  */
4799                         scan_commit(pRExC_state, data, minlenp, is_inf);
4800                         if (mincount && last_str) {
4801                             SV * const sv = data->last_found;
4802                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4803                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4804
4805                             if (mg)
4806                                 mg->mg_len = -1;
4807                             sv_setsv(sv, last_str);
4808                             data->last_end = data->pos_min;
4809                             data->last_start_min = data->pos_min - last_chrs;
4810                             data->last_start_max = is_inf
4811                                 ? SSize_t_MAX
4812                                 : data->pos_min + data->pos_delta - last_chrs;
4813                         }
4814                         data->longest = &(data->longest_float);
4815                     }
4816                     SvREFCNT_dec(last_str);
4817                 }
4818                 if (data && (fl & SF_HAS_EVAL))
4819                     data->flags |= SF_HAS_EVAL;
4820               optimize_curly_tail:
4821                 if (OP(oscan) != CURLYX) {
4822                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4823                            && NEXT_OFF(next))
4824                         NEXT_OFF(oscan) += NEXT_OFF(next);
4825                 }
4826                 continue;
4827
4828             default:
4829 #ifdef DEBUGGING
4830                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4831                                                                     OP(scan));
4832 #endif
4833             case REF:
4834             case CLUMP:
4835                 if (flags & SCF_DO_SUBSTR) {
4836                     /* Cannot expect anything... */
4837                     scan_commit(pRExC_state, data, minlenp, is_inf);
4838                     data->longest = &(data->longest_float);
4839                 }
4840                 is_inf = is_inf_internal = 1;
4841                 if (flags & SCF_DO_STCLASS_OR) {
4842                     if (OP(scan) == CLUMP) {
4843                         /* Actually is any start char, but very few code points
4844                          * aren't start characters */
4845                         ssc_match_all_cp(data->start_class);
4846                     }
4847                     else {
4848                         ssc_anything(data->start_class);
4849                     }
4850                 }
4851                 flags &= ~SCF_DO_STCLASS;
4852                 break;
4853             }
4854         }
4855         else if (OP(scan) == LNBREAK) {
4856             if (flags & SCF_DO_STCLASS) {
4857                 if (flags & SCF_DO_STCLASS_AND) {
4858                     ssc_intersection(data->start_class,
4859                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4860                     ssc_clear_locale(data->start_class);
4861                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4862                 }
4863                 else if (flags & SCF_DO_STCLASS_OR) {
4864                     ssc_union(data->start_class,
4865                               PL_XPosix_ptrs[_CC_VERTSPACE],
4866                               FALSE);
4867                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4868
4869                     /* See commit msg for
4870                      * 749e076fceedeb708a624933726e7989f2302f6a */
4871                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4872                 }
4873                 flags &= ~SCF_DO_STCLASS;
4874             }
4875             min++;
4876             delta++;    /* Because of the 2 char string cr-lf */
4877             if (flags & SCF_DO_SUBSTR) {
4878                 /* Cannot expect anything... */
4879                 scan_commit(pRExC_state, data, minlenp, is_inf);
4880                 data->pos_min += 1;
4881                 data->pos_delta += 1;
4882                 data->longest = &(data->longest_float);
4883             }
4884         }
4885         else if (REGNODE_SIMPLE(OP(scan))) {
4886
4887             if (flags & SCF_DO_SUBSTR) {
4888                 scan_commit(pRExC_state, data, minlenp, is_inf);
4889                 data->pos_min++;
4890             }
4891             min++;
4892             if (flags & SCF_DO_STCLASS) {
4893                 bool invert = 0;
4894                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4895                 U8 namedclass;
4896
4897                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4898                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4899
4900                 /* Some of the logic below assumes that switching
4901                    locale on will only add false positives. */
4902                 switch (OP(scan)) {
4903
4904                 default:
4905 #ifdef DEBUGGING
4906                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4907                                                                      OP(scan));
4908 #endif
4909                 case CANY:
4910                 case SANY:
4911                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4912                         ssc_match_all_cp(data->start_class);
4913                     break;
4914
4915                 case REG_ANY:
4916                     {
4917                         SV* REG_ANY_invlist = _new_invlist(2);
4918                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4919                                                             '\n');
4920                         if (flags & SCF_DO_STCLASS_OR) {
4921                             ssc_union(data->start_class,
4922                                       REG_ANY_invlist,
4923                                       TRUE /* TRUE => invert, hence all but \n
4924                                             */
4925                                       );
4926                         }
4927                         else if (flags & SCF_DO_STCLASS_AND) {
4928                             ssc_intersection(data->start_class,
4929                                              REG_ANY_invlist,
4930                                              TRUE  /* TRUE => invert */
4931                                              );
4932                             ssc_clear_locale(data->start_class);
4933                         }
4934                         SvREFCNT_dec_NN(REG_ANY_invlist);
4935                     }
4936                     break;
4937
4938                 case ANYOF:
4939                     if (flags & SCF_DO_STCLASS_AND)
4940                         ssc_and(pRExC_state, data->start_class,
4941                                 (regnode_charclass *) scan);
4942                     else
4943                         ssc_or(pRExC_state, data->start_class,
4944                                                           (regnode_charclass *) scan);
4945                     break;
4946
4947                 case NPOSIXL:
4948                     invert = 1;
4949                     /* FALLTHROUGH */
4950
4951                 case POSIXL:
4952                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4953                     if (flags & SCF_DO_STCLASS_AND) {
4954                         bool was_there = cBOOL(
4955                                           ANYOF_POSIXL_TEST(data->start_class,
4956                                                                  namedclass));
4957                         ANYOF_POSIXL_ZERO(data->start_class);
4958                         if (was_there) {    /* Do an AND */
4959                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4960                         }
4961                         /* No individual code points can now match */
4962                         data->start_class->invlist
4963                                                 = sv_2mortal(_new_invlist(0));
4964                     }
4965                     else {
4966                         int complement = namedclass + ((invert) ? -1 : 1);
4967
4968                         assert(flags & SCF_DO_STCLASS_OR);
4969
4970                         /* If the complement of this class was already there,
4971                          * the result is that they match all code points,
4972                          * (\d + \D == everything).  Remove the classes from
4973                          * future consideration.  Locale is not relevant in
4974                          * this case */
4975                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4976                             ssc_match_all_cp(data->start_class);
4977                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4978                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4979                         }
4980                         else {  /* The usual case; just add this class to the
4981                                    existing set */
4982                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4983                         }
4984                     }
4985                     break;
4986
4987                 case NPOSIXA:   /* For these, we always know the exact set of
4988                                    what's matched */
4989                     invert = 1;
4990                     /* FALLTHROUGH */
4991                 case POSIXA:
4992                     if (FLAGS(scan) == _CC_ASCII) {
4993                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4994                     }
4995                     else {
4996                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4997                                               PL_XPosix_ptrs[_CC_ASCII],
4998                                               &my_invlist);
4999                     }
5000                     goto join_posix;
5001
5002                 case NPOSIXD:
5003                 case NPOSIXU:
5004                     invert = 1;
5005                     /* FALLTHROUGH */
5006                 case POSIXD:
5007                 case POSIXU:
5008                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5009
5010                     /* NPOSIXD matches all upper Latin1 code points unless the
5011                      * target string being matched is UTF-8, which is
5012                      * unknowable until match time.  Since we are going to
5013                      * invert, we want to get rid of all of them so that the
5014                      * inversion will match all */
5015                     if (OP(scan) == NPOSIXD) {
5016                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5017                                           &my_invlist);
5018                     }
5019
5020                   join_posix:
5021
5022                     if (flags & SCF_DO_STCLASS_AND) {
5023                         ssc_intersection(data->start_class, my_invlist, invert);
5024                         ssc_clear_locale(data->start_class);
5025                     }
5026                     else {
5027                         assert(flags & SCF_DO_STCLASS_OR);
5028                         ssc_union(data->start_class, my_invlist, invert);
5029                     }
5030                 }
5031                 if (flags & SCF_DO_STCLASS_OR)
5032                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5033                 flags &= ~SCF_DO_STCLASS;
5034             }
5035         }
5036         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5037             data->flags |= (OP(scan) == MEOL
5038                             ? SF_BEFORE_MEOL
5039                             : SF_BEFORE_SEOL);
5040             scan_commit(pRExC_state, data, minlenp, is_inf);
5041
5042         }
5043         else if (  PL_regkind[OP(scan)] == BRANCHJ
5044                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5045                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5046                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5047             if ( OP(scan) == UNLESSM &&
5048                  scan->flags == 0 &&
5049                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5050                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5051             ) {
5052                 regnode *opt;
5053                 regnode *upto= regnext(scan);
5054                 DEBUG_PARSE_r({
5055                     SV * const mysv_val=sv_newmortal();
5056                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5057
5058                     /*DEBUG_PARSE_MSG("opfail");*/
5059                     regprop(RExC_rx, mysv_val, upto, NULL);
5060                     PerlIO_printf(Perl_debug_log,
5061                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5062                         SvPV_nolen_const(mysv_val),
5063                         (IV)REG_NODE_NUM(upto),
5064                         (IV)(upto - scan)
5065                     );
5066                 });
5067                 OP(scan) = OPFAIL;
5068                 NEXT_OFF(scan) = upto - scan;
5069                 for (opt= scan + 1; opt < upto ; opt++)
5070                     OP(opt) = OPTIMIZED;
5071                 scan= upto;
5072                 continue;
5073             }
5074             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5075                 || OP(scan) == UNLESSM )
5076             {
5077                 /* Negative Lookahead/lookbehind
5078                    In this case we can't do fixed string optimisation.
5079                 */
5080
5081                 SSize_t deltanext, minnext, fake = 0;
5082                 regnode *nscan;
5083                 regnode_ssc intrnl;
5084                 int f = 0;
5085
5086                 data_fake.flags = 0;
5087                 if (data) {
5088                     data_fake.whilem_c = data->whilem_c;
5089                     data_fake.last_closep = data->last_closep;
5090                 }
5091                 else
5092                     data_fake.last_closep = &fake;
5093                 data_fake.pos_delta = delta;
5094                 if ( flags & SCF_DO_STCLASS && !scan->flags
5095                      && OP(scan) == IFMATCH ) { /* Lookahead */
5096                     ssc_init(pRExC_state, &intrnl);
5097                     data_fake.start_class = &intrnl;
5098                     f |= SCF_DO_STCLASS_AND;
5099                 }
5100                 if (flags & SCF_WHILEM_VISITED_POS)
5101                     f |= SCF_WHILEM_VISITED_POS;
5102                 next = regnext(scan);
5103                 nscan = NEXTOPER(NEXTOPER(scan));
5104                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5105                                       last, &data_fake, stopparen,
5106                                       recursed_depth, NULL, f, depth+1);
5107                 if (scan->flags) {
5108                     if (deltanext) {
5109                         FAIL("Variable length lookbehind not implemented");
5110                     }
5111                     else if (minnext > (I32)U8_MAX) {
5112                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5113                               (UV)U8_MAX);
5114                     }
5115                     scan->flags = (U8)minnext;
5116                 }
5117                 if (data) {
5118                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5119                         pars++;
5120                     if (data_fake.flags & SF_HAS_EVAL)
5121                         data->flags |= SF_HAS_EVAL;
5122                     data->whilem_c = data_fake.whilem_c;
5123                 }
5124                 if (f & SCF_DO_STCLASS_AND) {
5125                     if (flags & SCF_DO_STCLASS_OR) {
5126                         /* OR before, AND after: ideally we would recurse with
5127                          * data_fake to get the AND applied by study of the
5128                          * remainder of the pattern, and then derecurse;
5129                          * *** HACK *** for now just treat as "no information".
5130                          * See [perl #56690].
5131                          */
5132                         ssc_init(pRExC_state, data->start_class);
5133                     }  else {
5134                         /* AND before and after: combine and continue */
5135                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5136                     }
5137                 }
5138             }
5139 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5140             else {
5141                 /* Positive Lookahead/lookbehind
5142                    In this case we can do fixed string optimisation,
5143                    but we must be careful about it. Note in the case of
5144                    lookbehind the positions will be offset by the minimum
5145                    length of the pattern, something we won't know about
5146                    until after the recurse.
5147                 */
5148                 SSize_t deltanext, fake = 0;
5149                 regnode *nscan;
5150                 regnode_ssc intrnl;
5151                 int f = 0;
5152                 /* We use SAVEFREEPV so that perl will clean up the
5153                     allocated minlens once the full compile is
5154                     finished. This way we don't have to worry about
5155                     freeing them ourselves when we know they won't
5156                     be used, which would be a pain.
5157                  */
5158                 SSize_t *minnextp;
5159                 Newx( minnextp, 1, SSize_t );
5160                 SAVEFREEPV(minnextp);
5161
5162                 if (data) {
5163                     StructCopy(data, &data_fake, scan_data_t);
5164                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5165                         f |= SCF_DO_SUBSTR;
5166                         if (scan->flags)
5167                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5168                         data_fake.last_found=newSVsv(data->last_found);
5169                     }
5170                 }
5171                 else
5172                     data_fake.last_closep = &fake;
5173                 data_fake.flags = 0;
5174                 data_fake.pos_delta = delta;
5175                 if (is_inf)
5176                     data_fake.flags |= SF_IS_INF;
5177                 if ( flags & SCF_DO_STCLASS && !scan->flags
5178                      && OP(scan) == IFMATCH ) { /* Lookahead */
5179                     ssc_init(pRExC_state, &intrnl);
5180                     data_fake.start_class = &intrnl;
5181                     f |= SCF_DO_STCLASS_AND;
5182                 }
5183                 if (flags & SCF_WHILEM_VISITED_POS)
5184                     f |= SCF_WHILEM_VISITED_POS;
5185                 next = regnext(scan);
5186                 nscan = NEXTOPER(NEXTOPER(scan));
5187
5188                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5189                                         &deltanext, last, &data_fake,
5190                                         stopparen, recursed_depth, NULL,
5191                                         f,depth+1);
5192                 if (scan->flags) {
5193                     if (deltanext) {
5194                         FAIL("Variable length lookbehind not implemented");
5195                     }
5196                     else if (*minnextp > (I32)U8_MAX) {
5197                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5198                               (UV)U8_MAX);
5199                     }
5200                     scan->flags = (U8)*minnextp;
5201                 }
5202
5203                 *minnextp += min;
5204
5205                 if (f & SCF_DO_STCLASS_AND) {
5206                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5207                 }
5208                 if (data) {
5209                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5210                         pars++;
5211                     if (data_fake.flags & SF_HAS_EVAL)
5212                         data->flags |= SF_HAS_EVAL;
5213                     data->whilem_c = data_fake.whilem_c;
5214                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5215                         if (RExC_rx->minlen<*minnextp)
5216                             RExC_rx->minlen=*minnextp;
5217                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5218                         SvREFCNT_dec_NN(data_fake.last_found);
5219
5220                         if ( data_fake.minlen_fixed != minlenp )
5221                         {
5222                             data->offset_fixed= data_fake.offset_fixed;
5223                             data->minlen_fixed= data_fake.minlen_fixed;
5224                             data->lookbehind_fixed+= scan->flags;
5225                         }
5226                         if ( data_fake.minlen_float != minlenp )
5227                         {
5228                             data->minlen_float= data_fake.minlen_float;
5229                             data->offset_float_min=data_fake.offset_float_min;
5230                             data->offset_float_max=data_fake.offset_float_max;
5231                             data->lookbehind_float+= scan->flags;
5232                         }
5233                     }
5234                 }
5235             }
5236 #endif
5237         }
5238         else if (OP(scan) == OPEN) {
5239             if (stopparen != (I32)ARG(scan))
5240                 pars++;
5241         }
5242         else if (OP(scan) == CLOSE) {
5243             if (stopparen == (I32)ARG(scan)) {
5244                 break;
5245             }
5246             if ((I32)ARG(scan) == is_par) {
5247                 next = regnext(scan);
5248
5249                 if ( next && (OP(next) != WHILEM) && next < last)
5250                     is_par = 0;         /* Disable optimization */
5251             }
5252             if (data)
5253                 *(data->last_closep) = ARG(scan);
5254         }
5255         else if (OP(scan) == EVAL) {
5256                 if (data)
5257                     data->flags |= SF_HAS_EVAL;
5258         }
5259         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5260             if (flags & SCF_DO_SUBSTR) {
5261                 scan_commit(pRExC_state, data, minlenp, is_inf);
5262                 flags &= ~SCF_DO_SUBSTR;
5263             }
5264             if (data && OP(scan)==ACCEPT) {
5265                 data->flags |= SCF_SEEN_ACCEPT;
5266                 if (stopmin > min)
5267                     stopmin = min;
5268             }
5269         }
5270         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5271         {
5272                 if (flags & SCF_DO_SUBSTR) {
5273                     scan_commit(pRExC_state, data, minlenp, is_inf);
5274                     data->longest = &(data->longest_float);
5275                 }
5276                 is_inf = is_inf_internal = 1;
5277                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5278                     ssc_anything(data->start_class);
5279                 flags &= ~SCF_DO_STCLASS;
5280         }
5281         else if (OP(scan) == GPOS) {
5282             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5283                 !(delta || is_inf || (data && data->pos_delta)))
5284             {
5285                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5286                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5287                 if (RExC_rx->gofs < (STRLEN)min)
5288                     RExC_rx->gofs = min;
5289             } else {
5290                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5291                 RExC_rx->gofs = 0;
5292             }
5293         }
5294 #ifdef TRIE_STUDY_OPT
5295 #ifdef FULL_TRIE_STUDY
5296         else if (PL_regkind[OP(scan)] == TRIE) {
5297             /* NOTE - There is similar code to this block above for handling
5298                BRANCH nodes on the initial study.  If you change stuff here
5299                check there too. */
5300             regnode *trie_node= scan;
5301             regnode *tail= regnext(scan);
5302             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5303             SSize_t max1 = 0, min1 = SSize_t_MAX;
5304             regnode_ssc accum;
5305
5306             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5307                 /* Cannot merge strings after this. */
5308                 scan_commit(pRExC_state, data, minlenp, is_inf);
5309             }
5310             if (flags & SCF_DO_STCLASS)
5311                 ssc_init_zero(pRExC_state, &accum);
5312
5313             if (!trie->jump) {
5314                 min1= trie->minlen;
5315                 max1= trie->maxlen;
5316             } else {
5317                 const regnode *nextbranch= NULL;
5318                 U32 word;
5319
5320                 for ( word=1 ; word <= trie->wordcount ; word++)
5321                 {
5322                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5323                     regnode_ssc this_class;
5324
5325                     data_fake.flags = 0;
5326                     if (data) {
5327                         data_fake.whilem_c = data->whilem_c;
5328                         data_fake.last_closep = data->last_closep;
5329                     }
5330                     else
5331                         data_fake.last_closep = &fake;
5332                     data_fake.pos_delta = delta;
5333                     if (flags & SCF_DO_STCLASS) {
5334                         ssc_init(pRExC_state, &this_class);
5335                         data_fake.start_class = &this_class;
5336                         f = SCF_DO_STCLASS_AND;
5337                     }
5338                     if (flags & SCF_WHILEM_VISITED_POS)
5339                         f |= SCF_WHILEM_VISITED_POS;
5340
5341                     if (trie->jump[word]) {
5342                         if (!nextbranch)
5343                             nextbranch = trie_node + trie->jump[0];
5344                         scan= trie_node + trie->jump[word];
5345                         /* We go from the jump point to the branch that follows
5346                            it. Note this means we need the vestigal unused
5347                            branches even though they arent otherwise used. */
5348                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5349                             &deltanext, (regnode *)nextbranch, &data_fake,
5350                             stopparen, recursed_depth, NULL, f,depth+1);
5351                     }
5352                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5353                         nextbranch= regnext((regnode*)nextbranch);
5354
5355                     if (min1 > (SSize_t)(minnext + trie->minlen))
5356                         min1 = minnext + trie->minlen;
5357                     if (deltanext == SSize_t_MAX) {
5358                         is_inf = is_inf_internal = 1;
5359                         max1 = SSize_t_MAX;
5360                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5361                         max1 = minnext + deltanext + trie->maxlen;
5362
5363                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5364                         pars++;
5365                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5366                         if ( stopmin > min + min1)
5367                             stopmin = min + min1;
5368                         flags &= ~SCF_DO_SUBSTR;
5369                         if (data)
5370                             data->flags |= SCF_SEEN_ACCEPT;
5371                     }
5372                     if (data) {
5373                         if (data_fake.flags & SF_HAS_EVAL)
5374                             data->flags |= SF_HAS_EVAL;
5375                         data->whilem_c = data_fake.whilem_c;
5376                     }
5377                     if (flags & SCF_DO_STCLASS)
5378                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5379                 }
5380             }
5381             if (flags & SCF_DO_SUBSTR) {
5382                 data->pos_min += min1;
5383                 data->pos_delta += max1 - min1;
5384                 if (max1 != min1 || is_inf)
5385                     data->longest = &(data->longest_float);
5386             }
5387             min += min1;
5388             delta += max1 - min1;
5389             if (flags & SCF_DO_STCLASS_OR) {
5390                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5391                 if (min1) {
5392                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5393                     flags &= ~SCF_DO_STCLASS;
5394                 }
5395             }
5396             else if (flags & SCF_DO_STCLASS_AND) {
5397                 if (min1) {
5398                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5399                     flags &= ~SCF_DO_STCLASS;
5400                 }
5401                 else {
5402                     /* Switch to OR mode: cache the old value of
5403                      * data->start_class */
5404                     INIT_AND_WITHP;
5405                     StructCopy(data->start_class, and_withp, regnode_ssc);
5406                     flags &= ~SCF_DO_STCLASS_AND;
5407                     StructCopy(&accum, data->start_class, regnode_ssc);
5408                     flags |= SCF_DO_STCLASS_OR;
5409                 }
5410             }
5411             scan= tail;
5412             continue;
5413         }
5414 #else
5415         else if (PL_regkind[OP(scan)] == TRIE) {
5416             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5417             U8*bang=NULL;
5418
5419             min += trie->minlen;
5420             delta += (trie->maxlen - trie->minlen);
5421             flags &= ~SCF_DO_STCLASS; /* xxx */
5422             if (flags & SCF_DO_SUBSTR) {
5423                 /* Cannot expect anything... */
5424                 scan_commit(pRExC_state, data, minlenp, is_inf);
5425                 data->pos_min += trie->minlen;
5426                 data->pos_delta += (trie->maxlen - trie->minlen);
5427                 if (trie->maxlen != trie->minlen)
5428                     data->longest = &(data->longest_float);
5429             }
5430             if (trie->jump) /* no more substrings -- for now /grr*/
5431                flags &= ~SCF_DO_SUBSTR;
5432         }
5433 #endif /* old or new */
5434 #endif /* TRIE_STUDY_OPT */
5435
5436         /* Else: zero-length, ignore. */
5437         scan = regnext(scan);
5438     }
5439     /* If we are exiting a recursion we can unset its recursed bit
5440      * and allow ourselves to enter it again - no danger of an
5441      * infinite loop there.
5442     if (stopparen > -1 && recursed) {
5443         DEBUG_STUDYDATA("unset:", data,depth);
5444         PAREN_UNSET( recursed, stopparen);
5445     }
5446     */
5447     if (frame) {
5448         DEBUG_STUDYDATA("frame-end:",data,depth);
5449         DEBUG_PEEP("fend", scan, depth);
5450         /* restore previous context */
5451         last = frame->last;
5452         scan = frame->next;
5453         stopparen = frame->stop;
5454         recursed_depth = frame->prev_recursed_depth;
5455         depth = depth - 1;
5456
5457         frame = frame->prev;
5458         goto fake_study_recurse;
5459     }
5460
5461   finish:
5462     assert(!frame);
5463     DEBUG_STUDYDATA("pre-fin:",data,depth);
5464
5465     *scanp = scan;
5466     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5467
5468     if (flags & SCF_DO_SUBSTR && is_inf)
5469         data->pos_delta = SSize_t_MAX - data->pos_min;
5470     if (is_par > (I32)U8_MAX)
5471         is_par = 0;
5472     if (is_par && pars==1 && data) {
5473         data->flags |= SF_IN_PAR;
5474         data->flags &= ~SF_HAS_PAR;
5475     }
5476     else if (pars && data) {
5477         data->flags |= SF_HAS_PAR;
5478         data->flags &= ~SF_IN_PAR;
5479     }
5480     if (flags & SCF_DO_STCLASS_OR)
5481         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5482     if (flags & SCF_TRIE_RESTUDY)
5483         data->flags |=  SCF_TRIE_RESTUDY;
5484
5485     DEBUG_STUDYDATA("post-fin:",data,depth);
5486
5487     {
5488         SSize_t final_minlen= min < stopmin ? min : stopmin;
5489
5490         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5491             RExC_maxlen = final_minlen + delta;
5492         }
5493         return final_minlen;
5494     }
5495     /* not-reached */
5496 }
5497
5498 STATIC U32
5499 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5500 {
5501     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5502
5503     PERL_ARGS_ASSERT_ADD_DATA;
5504
5505     Renewc(RExC_rxi->data,
5506            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5507            char, struct reg_data);
5508     if(count)
5509         Renew(RExC_rxi->data->what, count + n, U8);
5510     else
5511         Newx(RExC_rxi->data->what, n, U8);
5512     RExC_rxi->data->count = count + n;
5513     Copy(s, RExC_rxi->data->what + count, n, U8);
5514     return count;
5515 }
5516
5517 /*XXX: todo make this not included in a non debugging perl, but appears to be
5518  * used anyway there, in 'use re' */
5519 #ifndef PERL_IN_XSUB_RE
5520 void
5521 Perl_reginitcolors(pTHX)
5522 {
5523     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5524     if (s) {
5525         char *t = savepv(s);
5526         int i = 0;
5527         PL_colors[0] = t;
5528         while (++i < 6) {
5529             t = strchr(t, '\t');
5530             if (t) {
5531                 *t = '\0';
5532                 PL_colors[i] = ++t;
5533             }
5534             else
5535                 PL_colors[i] = t = (char *)"";
5536         }
5537     } else {
5538         int i = 0;
5539         while (i < 6)
5540             PL_colors[i++] = (char *)"";
5541     }
5542     PL_colorset = 1;
5543 }
5544 #endif
5545
5546
/* CHECK_RESTUDY_GOTO_butfirst(dOsomething)
 *
 * With TRIE_STUDY_OPT: if data.flags has SCF_TRIE_RESTUDY set and this is
 * the first time the check fires (restudied is still zero; it is
 * incremented as a side effect), execute dOsomething and then jump to the
 * reStudy label.  The expansion site must have 'data', 'restudied' and
 * the 'reStudy' label in scope.
 *
 * Without TRIE_STUDY_OPT: a no-op.  The fallback must be a function-like
 * macro too; an object-like one would leave "(dOsomething)" behind in the
 * source and so execute the argument unconditionally. */
#ifdef TRIE_STUDY_OPT
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
    STMT_START {                                            \
        if (                                                \
              (data.flags & SCF_TRIE_RESTUDY)               \
              && ! restudied++                              \
        ) {                                                 \
            dOsomething;                                    \
            goto reStudy;                                   \
        }                                                   \
    } STMT_END
#else
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) ((void)0)
#endif
5561
5562 /*
5563  * pregcomp - compile a regular expression into internal code
5564  *
5565  * Decides which engine's compiler to call based on the hint currently in
5566  * scope
5567  */
5568
5569 #ifndef PERL_IN_XSUB_RE
5570
5571 /* return the currently in-scope regex engine (or the default if none)  */
5572
5573 regexp_engine const *
5574 Perl_current_re_engine(pTHX)
5575 {
5576     if (IN_PERL_COMPILETIME) {
5577         HV * const table = GvHV(PL_hintgv);
5578         SV **ptr;
5579
5580         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5581             return &PL_core_reg_engine;
5582         ptr = hv_fetchs(table, "regcomp", FALSE);
5583         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5584             return &PL_core_reg_engine;
5585         return INT2PTR(regexp_engine*,SvIV(*ptr));
5586     }
5587     else {
5588         SV *ptr;
5589         if (!PL_curcop->cop_hints_hash)
5590             return &PL_core_reg_engine;
5591         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5592         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5593             return &PL_core_reg_engine;
5594         return INT2PTR(regexp_engine*,SvIV(ptr));
5595     }
5596 }
5597
5598
5599 REGEXP *
5600 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5601 {
5602     regexp_engine const *eng = current_re_engine();
5603     GET_RE_DEBUG_FLAGS_DECL;
5604
5605     PERL_ARGS_ASSERT_PREGCOMP;
5606
5607     /* Dispatch a request to compile a regexp to correct regexp engine. */
5608     DEBUG_COMPILE_r({
5609         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5610                         PTR2UV(eng));
5611     });
5612     return CALLREGCOMP_ENG(eng, pattern, flags);
5613 }
5614 #endif
5615
5616 /* public(ish) entry point for the perl core's own regex compiling code.
5617  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5618  * pattern rather than a list of OPs, and uses the internal engine rather
5619  * than the current one */
5620
5621 REGEXP *
5622 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5623 {
5624     SV *pat = pattern; /* defeat constness! */
5625     PERL_ARGS_ASSERT_RE_COMPILE;
5626     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5627 #ifdef PERL_IN_XSUB_RE
5628                                 &my_reg_engine,
5629 #else
5630                                 &PL_core_reg_engine,
5631 #endif
5632                                 NULL, NULL, rx_flags, 0);
5633 }
5634
5635
5636 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5637  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5638  * point to the realloced string and length.
5639  *
5640  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5641  * stuff added */
5642
5643 static void
5644 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5645                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5646 {
5647     U8 *const src = (U8*)*pat_p;
5648     U8 *dst;
5649     int n=0;
5650     STRLEN s = 0, d = 0;
5651     bool do_end = 0;
5652     GET_RE_DEBUG_FLAGS_DECL;
5653
5654     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5655         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5656
5657     Newx(dst, *plen_p * 2 + 1, U8);
5658
5659     while (s < *plen_p) {
5660         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5661             dst[d]   = src[s];
5662         else {
5663             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5664             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5665         }
5666         if (n < num_code_blocks) {
5667             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5668                 pRExC_state->code_blocks[n].start = d;
5669                 assert(dst[d] == '(');
5670                 do_end = 1;
5671             }
5672             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5673                 pRExC_state->code_blocks[n].end = d;
5674                 assert(dst[d] == ')');
5675                 do_end = 0;
5676                 n++;
5677             }
5678         }
5679         s++;
5680         d++;
5681     }
5682     dst[d] = '\0';
5683     *plen_p = d;
5684     *pat_p = (char*) dst;
5685     SAVEFREEPV(*pat_p);
5686     RExC_orig_utf8 = RExC_utf8 = 1;
5687 }
5688
5689
5690
5691 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5692  * while recording any code block indices, and handling overloading,
5693  * nested qr// objects etc.  If pat is null, it will allocate a new
5694  * string, or just return the first arg, if there's only one.
5695  *
5696  * Returns the malloced/updated pat.
5697  * patternp and pat_count is the array of SVs to be concatted;
5698  * oplist is the optional list of ops that generated the SVs;
5699  * recompile_p is a pointer to a boolean that will be set if
5700  *   the regex will need to be recompiled.
5701  * delim, if non-null is an SV that will be inserted between each element
5702  */
5703
5704 static SV*
5705 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5706                 SV *pat, SV ** const patternp, int pat_count,
5707                 OP *oplist, bool *recompile_p, SV *delim)
5708 {
5709     SV **svp;
5710     int n = 0;
5711     bool use_delim = FALSE;
5712     bool alloced = FALSE;
5713
5714     /* if we know we have at least two args, create an empty string,
5715      * then concatenate args to that. For no args, return an empty string */
5716     if (!pat && pat_count != 1) {
5717         pat = newSVpvs("");
5718         SAVEFREESV(pat);
5719         alloced = TRUE;
5720     }
5721
5722     for (svp = patternp; svp < patternp + pat_count; svp++) {
5723         SV *sv;
5724         SV *rx  = NULL;
5725         STRLEN orig_patlen = 0;
5726         bool code = 0;
5727         SV *msv = use_delim ? delim : *svp;
5728         if (!msv) msv = &PL_sv_undef;
5729
5730         /* if we've got a delimiter, we go round the loop twice for each
5731          * svp slot (except the last), using the delimiter the second
5732          * time round */
5733         if (use_delim) {
5734             svp--;
5735             use_delim = FALSE;
5736         }
5737         else if (delim)
5738             use_delim = TRUE;
5739
5740         if (SvTYPE(msv) == SVt_PVAV) {
5741             /* we've encountered an interpolated array within
5742              * the pattern, e.g. /...@a..../. Expand the list of elements,
5743              * then recursively append elements.
5744              * The code in this block is based on S_pushav() */
5745
5746             AV *const av = (AV*)msv;
5747             const SSize_t maxarg = AvFILL(av) + 1;
5748             SV **array;
5749
5750             if (oplist) {
5751                 assert(oplist->op_type == OP_PADAV
5752                     || oplist->op_type == OP_RV2AV);
5753                 oplist = OP_SIBLING(oplist);
5754             }
5755
5756             if (SvRMAGICAL(av)) {
5757                 SSize_t i;
5758
5759                 Newx(array, maxarg, SV*);
5760                 SAVEFREEPV(array);
5761                 for (i=0; i < maxarg; i++) {
5762                     SV ** const svp = av_fetch(av, i, FALSE);
5763                     array[i] = svp ? *svp : &PL_sv_undef;
5764                 }
5765             }
5766             else
5767                 array = AvARRAY(av);
5768
5769             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5770                                 array, maxarg, NULL, recompile_p,
5771                                 /* $" */
5772                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5773
5774             continue;
5775         }
5776
5777
5778         /* we make the assumption here that each op in the list of
5779          * op_siblings maps to one SV pushed onto the stack,
5780          * except for code blocks, with have both an OP_NULL and
5781          * and OP_CONST.
5782          * This allows us to match up the list of SVs against the
5783          * list of OPs to find the next code block.
5784          *
5785          * Note that       PUSHMARK PADSV PADSV ..
5786          * is optimised to
5787          *                 PADRANGE PADSV  PADSV  ..
5788          * so the alignment still works. */
5789
5790         if (oplist) {
5791             if (oplist->op_type == OP_NULL
5792                 && (oplist->op_flags & OPf_SPECIAL))
5793             {
5794                 assert(n < pRExC_state->num_code_blocks);
5795                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5796                 pRExC_state->code_blocks[n].block = oplist;
5797                 pRExC_state->code_blocks[n].src_regex = NULL;
5798                 n++;
5799                 code = 1;
5800                 oplist = OP_SIBLING(oplist); /* skip CONST */
5801                 assert(oplist);
5802             }
5803             oplist = OP_SIBLING(oplist);;
5804         }
5805
5806         /* apply magic and QR overloading to arg */
5807
5808         SvGETMAGIC(msv);
5809         if (SvROK(msv) && SvAMAGIC(msv)) {
5810             SV *sv = AMG_CALLunary(msv, regexp_amg);
5811             if (sv) {
5812                 if (SvROK(sv))
5813                     sv = SvRV(sv);
5814                 if (SvTYPE(sv) != SVt_REGEXP)
5815                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5816                 msv = sv;
5817             }
5818         }
5819
5820         /* try concatenation overload ... */
5821         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5822                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5823         {
5824             sv_setsv(pat, sv);
5825             /* overloading involved: all bets are off over literal
5826              * code. Pretend we haven't seen it */
5827             pRExC_state->num_code_blocks -= n;
5828             n = 0;
5829         }
5830         else  {
5831             /* ... or failing that, try "" overload */
5832             while (SvAMAGIC(msv)
5833                     && (sv = AMG_CALLunary(msv, string_amg))
5834                     && sv != msv
5835                     &&  !(   SvROK(msv)
5836                           && SvROK(sv)
5837                           && SvRV(msv) == SvRV(sv))
5838             ) {
5839                 msv = sv;
5840                 SvGETMAGIC(msv);
5841             }
5842             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5843                 msv = SvRV(msv);
5844
5845             if (pat) {
5846                 /* this is a partially unrolled
5847                  *     sv_catsv_nomg(pat, msv);
5848                  * that allows us to adjust code block indices if
5849                  * needed */
5850                 STRLEN dlen;
5851                 char *dst = SvPV_force_nomg(pat, dlen);
5852                 orig_patlen = dlen;
5853                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5854                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5855                     sv_setpvn(pat, dst, dlen);
5856                     SvUTF8_on(pat);
5857                 }
5858                 sv_catsv_nomg(pat, msv);
5859                 rx = msv;
5860             }
5861             else
5862                 pat = msv;
5863
5864             if (code)
5865                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5866         }
5867
5868         /* extract any code blocks within any embedded qr//'s */
5869         if (rx && SvTYPE(rx) == SVt_REGEXP
5870             && RX_ENGINE((REGEXP*)rx)->op_comp)
5871         {
5872
5873             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5874             if (ri->num_code_blocks) {
5875                 int i;
5876                 /* the presence of an embedded qr// with code means
5877                  * we should always recompile: the text of the
5878                  * qr// may not have changed, but it may be a
5879                  * different closure than last time */
5880                 *recompile_p = 1;
5881                 Renew(pRExC_state->code_blocks,
5882                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5883                     struct reg_code_block);
5884                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5885
5886                 for (i=0; i < ri->num_code_blocks; i++) {
5887                     struct reg_code_block *src, *dst;
5888                     STRLEN offset =  orig_patlen
5889                         + ReANY((REGEXP *)rx)->pre_prefix;
5890                     assert(n < pRExC_state->num_code_blocks);
5891                     src = &ri->code_blocks[i];
5892                     dst = &pRExC_state->code_blocks[n];
5893                     dst->start      = src->start + offset;
5894                     dst->end        = src->end   + offset;
5895                     dst->block      = src->block;
5896                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5897                                             src->src_regex
5898                                                 ? src->src_regex
5899                                                 : (REGEXP*)rx);
5900                     n++;
5901                 }
5902             }
5903         }
5904     }
5905     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5906     if (alloced)
5907         SvSETMAGIC(pat);
5908
5909     return pat;
5910 }
5911
5912
5913
5914 /* see if there are any run-time code blocks in the pattern.
5915  * False positives are allowed */
5916
5917 static bool
5918 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5919                     char *pat, STRLEN plen)
5920 {
5921     int n = 0;
5922     STRLEN s;
5923
5924     PERL_UNUSED_CONTEXT;
5925
5926     for (s = 0; s < plen; s++) {
5927         if (n < pRExC_state->num_code_blocks
5928             && s == pRExC_state->code_blocks[n].start)
5929         {
5930             s = pRExC_state->code_blocks[n].end;
5931             n++;
5932             continue;
5933         }
5934         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5935          * positives here */
5936         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5937             (pat[s+2] == '{'
5938                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5939         )
5940             return 1;
5941     }
5942     return 0;
5943 }
5944
5945 /* Handle run-time code blocks. We will already have compiled any direct
5946  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5947  * copy of it, but with any literal code blocks blanked out and
5948  * appropriate chars escaped; then feed it into
5949  *
5950  *    eval "qr'modified_pattern'"
5951  *
5952  * For example,
5953  *
5954  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5955  *
5956  * becomes
5957  *
5958  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5959  *
5960  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5961  * and merge them with any code blocks of the original regexp.
5962  *
5963  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5964  * instead, just save the qr and return FALSE; this tells our caller that
5965  * the original pattern needs upgrading to utf8.
5966  */
5967
5968 static bool
5969 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5970     char *pat, STRLEN plen)
5971 { /* returns 1 on success, 0 if the caller must upgrade pat to utf8 and retry */
5972     SV *qr; /* the REGEXP produced by (or saved from) the qr'...' eval */
5973     /* NB: croaks if that eval leaves a true value in ERRSV */
5974     GET_RE_DEBUG_FLAGS_DECL;
5975
5976     if (pRExC_state->runtime_code_qr) {
5977         /* this is the second time we've been called; this should
5978          * only happen if the main pattern got upgraded to utf8
5979          * during compilation; re-use the qr we compiled first time
5980          * round (which should be utf8 too)
5981          */
5982         qr = pRExC_state->runtime_code_qr;
5983         pRExC_state->runtime_code_qr = NULL; /* caller asserts this is empty */
5984         assert(RExC_utf8 && SvUTF8(qr));
5985     }
5986     else {
5987         int n = 0; /* index of the next literal code block to blank out */
5988         STRLEN s; /* offset of the current char within pat */
5989         char *p, *newpat; /* write cursor into, and start of, the qr'' text */
5990         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5991         SV *sv, *qr_ref; /* the text to eval; the value the eval returns */
5992         dSP;
5993
5994         /* determine how many extra chars we need for ' and \ escaping */
5995         for (s = 0; s < plen; s++) {
5996             if (pat[s] == '\'' || pat[s] == '\\')
5997                 newlen++;
5998         }
5999         /* (chars inside literal blocks are counted too: harmless slack) */
6000         Newx(newpat, newlen, char);
6001         p = newpat;
6002         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6003
6004         for (s = 0; s < plen; s++) {
6005             if (n < pRExC_state->num_code_blocks
6006                 && s == pRExC_state->code_blocks[n].start)
6007             {
6008                 /* blank out literal code block */
6009                 assert(pat[s] == '(');
6010                 while (s <= pRExC_state->code_blocks[n].end) {
6011                     *p++ = '_';
6012                     s++;
6013                 }
6014                 s--; /* undo the overshoot: the for loop's s++ still runs */
6015                 n++;
6016                 continue;
6017             }
6018             if (pat[s] == '\'' || pat[s] == '\\')
6019                 *p++ = '\\';
6020             *p++ = pat[s];
6021         }
6022         *p++ = '\'';
6023         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6024             *p++ = 'x'; /* carry the x modifier over to the qr'' */
6025         *p++ = '\0';
6026         DEBUG_COMPILE_r({
6027             PerlIO_printf(Perl_debug_log,
6028                 "%sre-parsing pattern for runtime code:%s %s\n",
6029                 PL_colors[4],PL_colors[5],newpat);
6030         });
6031         /* length p-newpat-1 leaves out the NUL that was just written */
6032         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6033         Safefree(newpat);
6034         /* eval the qr'...' text inside its own ENTER/LEAVE scope */
6035         ENTER;
6036         SAVETMPS;
6037         save_re_context();
6038         PUSHSTACKi(PERLSI_REQUIRE);
6039         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6040          * parsing qr''; normally only q'' does this. It also alters
6041          * hints handling */
6042         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6043         SvREFCNT_dec_NN(sv);
6044         SPAGAIN;
6045         qr_ref = POPs; /* G_SCALAR: the eval's single result */
6046         PUTBACK;
6047         {
6048             SV * const errsv = ERRSV;
6049             if (SvTRUE_NN(errsv))
6050             {
6051                 Safefree(pRExC_state->code_blocks); /* croak won't return */
6052                 /* use croak_sv ? */
6053                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6054             }
6055         }
6056         assert(SvROK(qr_ref));
6057         qr = SvRV(qr_ref);
6058         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6059         /* the leaving below frees the tmp qr_ref.
6060          * Give qr a life of its own */
6061         SvREFCNT_inc(qr);
6062         POPSTACK;
6063         FREETMPS;
6064         LEAVE;
6065
6066     }
6067
6068     if (!RExC_utf8 && SvUTF8(qr)) {
6069         /* first time through; the evalled qr came back utf8 but pat is
6070          * not: save the qr for the next time through */
6071         assert(!pRExC_state->runtime_code_qr);
6072         pRExC_state->runtime_code_qr = qr;
6073         return 0;
6074     }
6075
6076
6077     /* extract any code blocks within the returned qr//  */
6078
6079
6080     /* merge the main (r1) and run-time (r2) code blocks into one */
6081     {
6082         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6083         struct reg_code_block *new_block, *dst;
6084         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6085         int i1 = 0, i2 = 0; /* read cursors into r1's and r2's blocks */
6086
6087         if (!r2->num_code_blocks) /* we guessed wrong: the qr has none */
6088         {
6089             SvREFCNT_dec_NN(qr);
6090             return 1;
6091         }
6092
6093         Newx(new_block,
6094             r1->num_code_blocks + r2->num_code_blocks,
6095             struct reg_code_block);
6096         dst = new_block;
6097         /* interleave both lists, always taking the smaller start offset */
6098         while (    i1 < r1->num_code_blocks
6099                 || i2 < r2->num_code_blocks)
6100         {
6101             struct reg_code_block *src;
6102             bool is_qr = 0; /* set when src comes from the evalled qr (r2) */
6103
6104             if (i1 == r1->num_code_blocks) {
6105                 src = &r2->code_blocks[i2++];
6106                 is_qr = 1;
6107             }
6108             else if (i2 == r2->num_code_blocks)
6109                 src = &r1->code_blocks[i1++];
6110             else if (  r1->code_blocks[i1].start
6111                      < r2->code_blocks[i2].start)
6112             {
6113                 src = &r1->code_blocks[i1++];
6114                 assert(src->end < r2->code_blocks[i2].start);
6115             }
6116             else {
6117                 assert(  r1->code_blocks[i1].start
6118                        > r2->code_blocks[i2].start);
6119                 src = &r2->code_blocks[i2++];
6120                 is_qr = 1;
6121                 assert(src->end < r1->code_blocks[i1].start);
6122             }
6123             /* offsets from either list must index a (...) span of pat */
6124             assert(pat[src->start] == '(');
6125             assert(pat[src->end]   == ')');
6126             dst->start      = src->start;
6127             dst->end        = src->end;
6128             dst->block      = src->block;
6129             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6130                                     : src->src_regex; /* qr blocks each hold a ref */
6131             dst++;
6132         }
6133         r1->num_code_blocks += r2->num_code_blocks;
6134         Safefree(r1->code_blocks); /* old array is superseded by new_block */
6135         r1->code_blocks = new_block;
6136     }
6137     /* drop the reference taken after the eval (or carried over via the slot) */
6138     SvREFCNT_dec_NN(qr);
6139     return 1;
6140 }
6141
6142
6143 STATIC bool
6144 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6145                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6146                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6147                       STRLEN longest_length, bool eol, bool meol)
6148 {
6149     /* Shared helper for the fixed-length and floating substring setup done
6150      * by Perl_re_op_compile() below.  Stores sv_longest in the output slot
6151      * that matches its utf8ness, works out the end shift and runs
6152      * fbm_compile() on it.  Returns FALSE, with no output written, when
6153      * there is nothing usable to set up; TRUE otherwise. */
6154
6155     /* Can't have SEOL and MULTI: the TAIL treatment is wanted only when eol
6156      * is set and either meol is clear or the pattern has RXf_PMf_MULTILINE */
6157     const I32 want_tail = (eol
6158                            && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6159     SSize_t min_len;
6160
6161     /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6162     if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
6163         return FALSE;
6164
6165     /* neither a non-empty string nor the eol condition above: nothing to do */
6166     if (! longest_length && ! want_tail)
6167         return FALSE;
6168
6169     /* copy the information about the longest from the reg_scan_data
6170      * over to the program, clearing the slot of the other utf8ness */
6171     if (! SvUTF8(sv_longest)) {
6172         *rx_substr = sv_longest;
6173         *rx_utf8 = NULL;
6174     }
6175     else {
6176         *rx_utf8 = sv_longest;
6177         *rx_substr = NULL;
6178     }
6179
6180     /* end_shift is how many chars must still match after this item.  It is
6181      * computed now because once the lookbehind offset has been added in it
6182      * can no longer be worked out correctly. */
6183     min_len = minlen ? *minlen : (SSize_t)longest_length;
6184     *rx_end_shift = min_len - offset - longest_length
6185                     + (SvTAIL(sv_longest) != 0) + lookbehind;
6186
6187     fbm_compile(sv_longest, want_tail ? FBMcf_TAIL : 0);
6188     return TRUE;
6189 }
6190
6191 /*
6192  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6193  * regular expression into internal code.
6194  * The pattern may be passed either as:
6195  *    a list of SVs (patternp plus pat_count)
6196  *    a list of OPs (expr)
6197  * If both are passed, the SV list is used, but the OP list indicates
6198  * which SVs are actually pre-compiled code blocks
6199  *
6200  * The SVs in the list have magic and qr overloading applied to them (and
6201  * the list may be modified in-place with replacement SVs in the latter
6202  * case).
6203  *
6204  * If the pattern hasn't changed from old_re, then old_re will be
6205  * returned.
6206  *
6207  * eng is the current engine. If that engine has an op_comp method, then
6208  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6209  * do the initial concatenation of arguments and pass on to the external
6210  * engine.
6211  *
6212  * If is_bare_re is not null, set it to a boolean indicating whether the
6213  * arg list reduced (after overloading) to a single bare regex which has
6214  * been returned (i.e. /$qr/).
6215  *
6216  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6217  *
6218  * pm_flags contains the PMf_* flags, typically based on those from the
6219  * pm_flags field of the related PMOP. Currently we're only interested in
6220  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6221  *
6222  * We can't allocate space until we know how big the compiled form will be,
6223  * but we can't compile it (and thus know how big it is) until we've got a
6224  * place to put the code.  So we cheat:  we compile it twice, once with code
6225  * generation turned off and size counting turned on, and once "for real".
6226  * This also means that we don't allocate space until we are sure that the
6227  * thing really will compile successfully, and we never have to move the
6228  * code and thus invalidate pointers into it.  (Note that it has to be in
6229  * one piece because free() must be able to free it all.) [NB: not true in perl]
6230  *
6231  * Beware that the optimization-preparation code in here knows about some
6232  * of the structure of the compiled regexp.  [I'll say.]
6233  */
6234
6235 REGEXP *
6236 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6237                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6238                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6239 {
6240     REGEXP *rx;
6241     struct regexp *r;
6242     regexp_internal *ri;
6243     STRLEN plen;
6244     char *exp;
6245     regnode *scan;
6246     I32 flags;
6247     SSize_t minlen = 0;
6248     U32 rx_flags;
6249     SV *pat;
6250     SV *code_blocksv = NULL;
6251     SV** new_patternp = patternp;
6252
6253     /* these are all flags - maybe they should be turned
6254      * into a single int with different bit masks */
6255     I32 sawlookahead = 0;
6256     I32 sawplus = 0;
6257     I32 sawopen = 0;
6258     I32 sawminmod = 0;
6259
6260     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6261     bool recompile = 0;
6262     bool runtime_code = 0;
6263     scan_data_t data;
6264     RExC_state_t RExC_state;
6265     RExC_state_t * const pRExC_state = &RExC_state;
6266 #ifdef TRIE_STUDY_OPT
6267     int restudied = 0;
6268     RExC_state_t copyRExC_state;
6269 #endif
6270     GET_RE_DEBUG_FLAGS_DECL;
6271
6272     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6273
6274     DEBUG_r(if (!PL_colorset) reginitcolors());
6275
6276 #ifndef PERL_IN_XSUB_RE
6277     /* Initialize these here instead of as-needed, as is quick and avoids
6278      * having to test them each time otherwise */
6279     if (! PL_AboveLatin1) {
6280         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6281         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6282         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6283         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6284         PL_HasMultiCharFold =
6285                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6286     }
6287 #endif
6288
6289     pRExC_state->code_blocks = NULL;
6290     pRExC_state->num_code_blocks = 0;
6291
6292     if (is_bare_re)
6293         *is_bare_re = FALSE;
6294
6295     if (expr && (expr->op_type == OP_LIST ||
6296                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6297         /* allocate code_blocks if needed */
6298         OP *o;
6299         int ncode = 0;
6300
6301         for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6302             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6303                 ncode++; /* count of DO blocks */
6304         if (ncode) {
6305             pRExC_state->num_code_blocks = ncode;
6306             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6307         }
6308     }
6309
6310     if (!pat_count) {
6311         /* compile-time pattern with just OP_CONSTs and DO blocks */
6312
6313         int n;
6314         OP *o;
6315
6316         /* find how many CONSTs there are */
6317         assert(expr);
6318         n = 0;
6319         if (expr->op_type == OP_CONST)
6320             n = 1;
6321         else
6322             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6323                 if (o->op_type == OP_CONST)
6324                     n++;
6325             }
6326
6327         /* fake up an SV array */
6328
6329         assert(!new_patternp);
6330         Newx(new_patternp, n, SV*);
6331         SAVEFREEPV(new_patternp);
6332         pat_count = n;
6333
6334         n = 0;
6335         if (expr->op_type == OP_CONST)
6336             new_patternp[n] = cSVOPx_sv(expr);
6337         else
6338             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6339                 if (o->op_type == OP_CONST)
6340                     new_patternp[n++] = cSVOPo_sv;
6341             }
6342
6343     }
6344
6345     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6346         "Assembling pattern from %d elements%s\n", pat_count,
6347             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6348
6349     /* set expr to the first arg op */
6350
6351     if (pRExC_state->num_code_blocks
6352          && expr->op_type != OP_CONST)
6353     {
6354             expr = cLISTOPx(expr)->op_first;
6355             assert(   expr->op_type == OP_PUSHMARK
6356                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6357                    || expr->op_type == OP_PADRANGE);
6358             expr = OP_SIBLING(expr);
6359     }
6360
6361     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6362                         expr, &recompile, NULL);
6363
6364     /* handle bare (possibly after overloading) regex: foo =~ $re */
6365     {
6366         SV *re = pat;
6367         if (SvROK(re))
6368             re = SvRV(re);
6369         if (SvTYPE(re) == SVt_REGEXP) {
6370             if (is_bare_re)
6371                 *is_bare_re = TRUE;
6372             SvREFCNT_inc(re);
6373             Safefree(pRExC_state->code_blocks);
6374             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6375                 "Precompiled pattern%s\n",
6376                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6377
6378             return (REGEXP*)re;
6379         }
6380     }
6381
6382     exp = SvPV_nomg(pat, plen);
6383
6384     if (!eng->op_comp) {
6385         if ((SvUTF8(pat) && IN_BYTES)
6386                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6387         {
6388             /* make a temporary copy; either to convert to bytes,
6389              * or to avoid repeating get-magic / overloaded stringify */
6390             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6391                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6392         }
6393         Safefree(pRExC_state->code_blocks);
6394         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6395     }
6396
6397     /* ignore the utf8ness if the pattern is 0 length */
6398     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6399     RExC_uni_semantics = 0;
6400     RExC_contains_locale = 0;
6401     RExC_contains_i = 0;
6402     pRExC_state->runtime_code_qr = NULL;
6403
6404     DEBUG_COMPILE_r({
6405             SV *dsv= sv_newmortal();
6406             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6407             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6408                           PL_colors[4],PL_colors[5],s);
6409         });
6410
6411   redo_first_pass:
6412     /* we jump here if we upgrade the pattern to utf8 and have to
6413      * recompile */
6414
6415     if ((pm_flags & PMf_USE_RE_EVAL)
6416                 /* this second condition covers the non-regex literal case,
6417                  * i.e.  $foo =~ '(?{})'. */
6418                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6419     )
6420         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6421
6422     /* return old regex if pattern hasn't changed */
6423     /* XXX: note in the below we have to check the flags as well as the
6424      * pattern.
6425      *
6426      * Things get a touch tricky as we have to compare the utf8 flag
6427      * independently from the compile flags.  */
6428
6429     if (   old_re
6430         && !recompile
6431         && !!RX_UTF8(old_re) == !!RExC_utf8
6432         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6433         && RX_PRECOMP(old_re)
6434         && RX_PRELEN(old_re) == plen
6435         && memEQ(RX_PRECOMP(old_re), exp, plen)
6436         && !runtime_code /* with runtime code, always recompile */ )
6437     {
6438         Safefree(pRExC_state->code_blocks);
6439         return old_re;
6440     }
6441
6442     rx_flags = orig_rx_flags;
6443
6444     if (rx_flags & PMf_FOLD) {
6445         RExC_contains_i = 1;
6446     }
6447     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6448
6449         /* Set to use unicode semantics if the pattern is in utf8 and has the
6450          * 'depends' charset specified, as it means unicode when utf8  */
6451         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6452     }
6453
6454     RExC_precomp = exp;
6455     RExC_flags = rx_flags;
6456     RExC_pm_flags = pm_flags;
6457
6458     if (runtime_code) {
6459         if (TAINTING_get && TAINT_get)
6460             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6461
6462         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6463             /* whoops, we have a non-utf8 pattern, whilst run-time code
6464              * got compiled as utf8. Try again with a utf8 pattern */
6465             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6466                                     pRExC_state->num_code_blocks);
6467             goto redo_first_pass;
6468         }
6469     }
6470     assert(!pRExC_state->runtime_code_qr);
6471
6472     RExC_sawback = 0;
6473
6474     RExC_seen = 0;
6475     RExC_maxlen = 0;
6476     RExC_in_lookbehind = 0;
6477     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6478     RExC_extralen = 0;
6479     RExC_override_recoding = 0;
6480     RExC_in_multi_char_class = 0;
6481
6482     /* First pass: determine size, legality. */
6483     RExC_parse = exp;
6484     RExC_start = exp;
6485     RExC_end = exp + plen;
6486     RExC_naughty = 0;
6487     RExC_npar = 1;
6488     RExC_nestroot = 0;
6489     RExC_size = 0L;
6490     RExC_emit = (regnode *) &RExC_emit_dummy;
6491     RExC_whilem_seen = 0;
6492     RExC_open_parens = NULL;
6493     RExC_close_parens = NULL;
6494     RExC_opend = NULL;
6495     RExC_paren_names = NULL;
6496 #ifdef DEBUGGING
6497     RExC_paren_name_list = NULL;
6498 #endif
6499     RExC_recurse = NULL;
6500     RExC_study_chunk_recursed = NULL;
6501     RExC_study_chunk_recursed_bytes= 0;
6502     RExC_recurse_count = 0;
6503     pRExC_state->code_index = 0;
6504
6505 #if 0 /* REGC() is (currently) a NOP at the first pass.
6506        * Clever compilers notice this and complain. --jhi */
6507     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6508 #endif
6509     DEBUG_PARSE_r(
6510         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6511         RExC_lastnum=0;
6512         RExC_lastparse=NULL;
6513     );
6514     /* reg may croak on us, not giving us a chance to free
6515        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6516        need it to survive as long as the regexp (qr/(?{})/).
6517        We must check that code_blocksv is not already set, because we may
6518        have jumped back to restart the sizing pass. */
6519     if (pRExC_state->code_blocks && !code_blocksv) {
6520         code_blocksv = newSV_type(SVt_PV);
6521         SAVEFREESV(code_blocksv);
6522         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6523         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6524     }
6525     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6526         /* It's possible to write a regexp in ascii that represents Unicode
6527         codepoints outside of the byte range, such as via \x{100}. If we
6528         detect such a sequence we have to convert the entire pattern to utf8
6529         and then recompile, as our sizing calculation will have been based
6530         on 1 byte == 1 character, but we will need to use utf8 to encode
6531         at least some part of the pattern, and therefore must convert the whole
6532         thing.
6533         -- dmq */
6534         if (flags & RESTART_UTF8) {
6535             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6536                                     pRExC_state->num_code_blocks);
6537             goto redo_first_pass;
6538         }
6539         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6540     }
6541     if (code_blocksv)
6542         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6543
6544     DEBUG_PARSE_r({
6545         PerlIO_printf(Perl_debug_log,
6546             "Required size %"IVdf" nodes\n"
6547             "Starting second pass (creation)\n",
6548             (IV)RExC_size);
6549         RExC_lastnum=0;
6550         RExC_lastparse=NULL;
6551     });
6552
6553     /* The first pass could have found things that force Unicode semantics */
6554     if ((RExC_utf8 || RExC_uni_semantics)
6555          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6556     {
6557         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6558     }
6559
6560     /* Small enough for pointer-storage convention?
6561        If extralen==0, this means that we will not need long jumps. */
6562     if (RExC_size >= 0x10000L && RExC_extralen)
6563         RExC_size += RExC_extralen;
6564     else
6565         RExC_extralen = 0;
6566     if (RExC_whilem_seen > 15)
6567         RExC_whilem_seen = 15;
6568
6569     /* Allocate space and zero-initialize. Note, the two step process
6570        of zeroing when in debug mode, thus anything assigned has to
6571        happen after that */
6572     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6573     r = ReANY(rx);
6574     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6575          char, regexp_internal);
6576     if ( r == NULL || ri == NULL )
6577         FAIL("Regexp out of space");
6578 #ifdef DEBUGGING
6579     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6580     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6581          char);
6582 #else
6583     /* bulk initialize base fields with 0. */
6584     Zero(ri, sizeof(regexp_internal), char);
6585 #endif
6586
6587     /* non-zero initialization begins here */
6588     RXi_SET( r, ri );
6589     r->engine= eng;
6590     r->extflags = rx_flags;
6591     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6592
6593     if (pm_flags & PMf_IS_QR) {
6594         ri->code_blocks = pRExC_state->code_blocks;
6595         ri->num_code_blocks = pRExC_state->num_code_blocks;
6596     }
6597     else
6598     {
6599         int n;
6600         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6601             if (pRExC_state->code_blocks[n].src_regex)
6602                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6603         SAVEFREEPV(pRExC_state->code_blocks);
6604     }
6605
6606     {
6607         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6608         bool has_charset = (get_regex_charset(r->extflags)
6609                                                     != REGEX_DEPENDS_CHARSET);
6610
6611         /* The caret is output if there are any defaults: if not all the STD
6612          * flags are set, or if no character set specifier is needed */
6613         bool has_default =
6614                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6615                     || ! has_charset);
6616         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6617                                                    == REG_RUN_ON_COMMENT_SEEN);
6618         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6619                             >> RXf_PMf_STD_PMMOD_SHIFT);
6620         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6621         char *p;
6622         /* Allocate for the worst case, which is all the std flags are turned
6623          * on.  If more precision is desired, we could do a population count of
6624          * the flags set.  This could be done with a small lookup table, or by
6625          * shifting, masking and adding, or even, when available, assembly
6626          * language for a machine-language population count.
6627          * We never output a minus, as all those are defaults, so are
6628          * covered by the caret */
6629         const STRLEN wraplen = plen + has_p + has_runon
6630             + has_default       /* If needs a caret */
6631
6632                 /* If needs a character set specifier */
6633             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6634             + (sizeof(STD_PAT_MODS) - 1)
6635             + (sizeof("(?:)") - 1);
6636
6637         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6638         r->xpv_len_u.xpvlenu_pv = p;
6639         if (RExC_utf8)
6640             SvFLAGS(rx) |= SVf_UTF8;
6641         *p++='('; *p++='?';
6642
6643         /* If a default, cover it using the caret */
6644         if (has_default) {
6645             *p++= DEFAULT_PAT_MOD;
6646         }
6647         if (has_charset) {
6648             STRLEN len;
6649             const char* const name = get_regex_charset_name(r->extflags, &len);
6650             Copy(name, p, len, char);
6651             p += len;
6652         }
6653         if (has_p)
6654             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6655         {
6656             char ch;
6657             while((ch = *fptr++)) {
6658                 if(reganch & 1)
6659                     *p++ = ch;
6660                 reganch >>= 1;
6661             }
6662         }
6663
6664         *p++ = ':';
6665         Copy(RExC_precomp, p, plen, char);
6666         assert ((RX_WRAPPED(rx) - p) < 16);
6667         r->pre_prefix = p - RX_WRAPPED(rx);
6668         p += plen;
6669         if (has_runon)
6670             *p++ = '\n';
6671         *p++ = ')';
6672         *p = 0;
6673         SvCUR_set(rx, p - RX_WRAPPED(rx));
6674     }
6675
6676     r->intflags = 0;
6677     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6678
6679     /* setup various meta data about recursion, this all requires
6680      * RExC_npar to be correctly set, and a bit later on we clear it */
6681     if (RExC_seen & REG_RECURSE_SEEN) {
6682         Newxz(RExC_open_parens, RExC_npar,regnode *);
6683         SAVEFREEPV(RExC_open_parens);
6684         Newxz(RExC_close_parens,RExC_npar,regnode *);
6685         SAVEFREEPV(RExC_close_parens);
6686     }
6687     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6688         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6689          * So its 1 if there are no parens. */
6690         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6691                                          ((RExC_npar & 0x07) != 0);
6692         Newx(RExC_study_chunk_recursed,
6693              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6694         SAVEFREEPV(RExC_study_chunk_recursed);
6695     }
6696
6697     /* Useful during FAIL. */
6698 #ifdef RE_TRACK_PATTERN_OFFSETS
6699     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6700     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6701                           "%s %"UVuf" bytes for offset annotations.\n",
6702                           ri->u.offsets ? "Got" : "Couldn't get",
6703                           (UV)((2*RExC_size+1) * sizeof(U32))));
6704 #endif
6705     SetProgLen(ri,RExC_size);
6706     RExC_rx_sv = rx;
6707     RExC_rx = r;
6708     RExC_rxi = ri;
6709
6710     /* Second pass: emit code. */
6711     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6712     RExC_pm_flags = pm_flags;
6713     RExC_parse = exp;
6714     RExC_end = exp + plen;
6715     RExC_naughty = 0;
6716     RExC_npar = 1;
6717     RExC_emit_start = ri->program;
6718     RExC_emit = ri->program;
6719     RExC_emit_bound = ri->program + RExC_size + 1;
6720     pRExC_state->code_index = 0;
6721
6722     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6723     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6724         ReREFCNT_dec(rx);
6725         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6726     }
6727     /* XXXX To minimize changes to RE engine we always allocate
6728        3-units-long substrs field. */
6729     Newx(r->substrs, 1, struct reg_substr_data);
6730     if (RExC_recurse_count) {
6731         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6732         SAVEFREEPV(RExC_recurse);
6733     }
6734
6735 reStudy:
6736     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6737     Zero(r->substrs, 1, struct reg_substr_data);
6738     if (RExC_study_chunk_recursed)
6739         Zero(RExC_study_chunk_recursed,
6740              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6741
6742 #ifdef TRIE_STUDY_OPT
6743     if (!restudied) {
6744         StructCopy(&zero_scan_data, &data, scan_data_t);
6745         copyRExC_state = RExC_state;
6746     } else {
6747         U32 seen=RExC_seen;
6748         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6749
6750         RExC_state = copyRExC_state;
6751         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6752             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6753         else
6754             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6755         StructCopy(&zero_scan_data, &data, scan_data_t);
6756     }
6757 #else
6758     StructCopy(&zero_scan_data, &data, scan_data_t);
6759 #endif
6760
6761     /* Dig out information for optimizations. */
6762     r->extflags = RExC_flags; /* was pm_op */
6763     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6764
6765     if (UTF)
6766         SvUTF8_on(rx);  /* Unicode in it? */
6767     ri->regstclass = NULL;
6768     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6769         r->intflags |= PREGf_NAUGHTY;
6770     scan = ri->program + 1;             /* First BRANCH. */
6771
6772     /* testing for BRANCH here tells us whether there is "must appear"
6773        data in the pattern. If there is then we can use it for optimisations */
6774     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6775                                                   */
6776         SSize_t fake;
6777         STRLEN longest_float_length, longest_fixed_length;
6778         regnode_ssc ch_class; /* pointed to by data */
6779         int stclass_flag;
6780         SSize_t last_close = 0; /* pointed to by data */
6781         regnode *first= scan;
6782         regnode *first_next= regnext(first);
6783         /*
6784          * Skip introductions and multiplicators >= 1
6785          * so that we can extract the 'meat' of the pattern that must
6786          * match in the large if() sequence following.
6787          * NOTE that EXACT is NOT covered here, as it is normally
6788          * picked up by the optimiser separately.
6789          *
6790          * This is unfortunate as the optimiser isnt handling lookahead
6791          * properly currently.
6792          *
6793          */
6794         while ((OP(first) == OPEN && (sawopen = 1)) ||
6795                /* An OR of *one* alternative - should not happen now. */
6796             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6797             /* for now we can't handle lookbehind IFMATCH*/
6798             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6799             (OP(first) == PLUS) ||
6800             (OP(first) == MINMOD) ||
6801                /* An {n,m} with n>0 */
6802             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6803             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6804         {
6805                 /*
6806                  * the only op that could be a regnode is PLUS, all the rest
6807                  * will be regnode_1 or regnode_2.
6808                  *
6809                  * (yves doesn't think this is true)
6810                  */
6811                 if (OP(first) == PLUS)
6812                     sawplus = 1;
6813                 else {
6814                     if (OP(first) == MINMOD)
6815                         sawminmod = 1;
6816                     first += regarglen[OP(first)];
6817                 }
6818                 first = NEXTOPER(first);
6819                 first_next= regnext(first);
6820         }
6821
6822         /* Starting-point info. */
6823       again:
6824         DEBUG_PEEP("first:",first,0);
6825         /* Ignore EXACT as we deal with it later. */
6826         if (PL_regkind[OP(first)] == EXACT) {
6827             if (OP(first) == EXACT)
6828                 NOOP;   /* Empty, get anchored substr later. */
6829             else
6830                 ri->regstclass = first;
6831         }
6832 #ifdef TRIE_STCLASS
6833         else if (PL_regkind[OP(first)] == TRIE &&
6834                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6835         {
6836             /* this can happen only on restudy */
6837             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6838         }
6839 #endif
6840         else if (REGNODE_SIMPLE(OP(first)))
6841             ri->regstclass = first;
6842         else if (PL_regkind[OP(first)] == BOUND ||
6843                  PL_regkind[OP(first)] == NBOUND)
6844             ri->regstclass = first;
6845         else if (PL_regkind[OP(first)] == BOL) {
6846             r->intflags |= (OP(first) == MBOL
6847                            ? PREGf_ANCH_MBOL
6848                            : (OP(first) == SBOL
6849                               ? PREGf_ANCH_SBOL
6850                               : PREGf_ANCH_BOL));
6851             first = NEXTOPER(first);
6852             goto again;
6853         }
6854         else if (OP(first) == GPOS) {
6855             r->intflags |= PREGf_ANCH_GPOS;
6856             first = NEXTOPER(first);
6857             goto again;
6858         }
6859         else if ((!sawopen || !RExC_sawback) &&
6860             !sawlookahead &&
6861             (OP(first) == STAR &&
6862             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6863             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6864         {
6865             /* turn .* into ^.* with an implied $*=1 */
6866             const int type =
6867                 (OP(NEXTOPER(first)) == REG_ANY)
6868                     ? PREGf_ANCH_MBOL
6869                     : PREGf_ANCH_SBOL;
6870             r->intflags |= (type | PREGf_IMPLICIT);
6871             first = NEXTOPER(first);
6872             goto again;
6873         }
6874         if (sawplus && !sawminmod && !sawlookahead
6875             && (!sawopen || !RExC_sawback)
6876             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6877             /* x+ must match at the 1st pos of run of x's */
6878             r->intflags |= PREGf_SKIP;
6879
6880         /* Scan is after the zeroth branch, first is atomic matcher. */
6881 #ifdef TRIE_STUDY_OPT
6882         DEBUG_PARSE_r(
6883             if (!restudied)
6884                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6885                               (IV)(first - scan + 1))
6886         );
6887 #else
6888         DEBUG_PARSE_r(
6889             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6890                 (IV)(first - scan + 1))
6891         );
6892 #endif
6893
6894
6895         /*
6896         * If there's something expensive in the r.e., find the
6897         * longest literal string that must appear and make it the
6898         * regmust.  Resolve ties in favor of later strings, since
6899         * the regstart check works with the beginning of the r.e.
6900         * and avoiding duplication strengthens checking.  Not a
6901         * strong reason, but sufficient in the absence of others.
6902         * [Now we resolve ties in favor of the earlier string if
6903         * it happens that c_offset_min has been invalidated, since the
6904         * earlier string may buy us something the later one won't.]
6905         */
6906
6907         data.longest_fixed = newSVpvs("");
6908         data.longest_float = newSVpvs("");
6909         data.last_found = newSVpvs("");
6910         data.longest = &(data.longest_fixed);
6911         ENTER_with_name("study_chunk");
6912         SAVEFREESV(data.longest_fixed);
6913         SAVEFREESV(data.longest_float);
6914         SAVEFREESV(data.last_found);
6915         first = scan;
6916         if (!ri->regstclass) {
6917             ssc_init(pRExC_state, &ch_class);
6918             data.start_class = &ch_class;
6919             stclass_flag = SCF_DO_STCLASS_AND;
6920         } else                          /* XXXX Check for BOUND? */
6921             stclass_flag = 0;
6922         data.last_closep = &last_close;
6923
6924         DEBUG_RExC_seen();
6925         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6926                              scan + RExC_size, /* Up to end */
6927             &data, -1, 0, NULL,
6928             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6929                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6930             0);
6931
6932
6933         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6934
6935
6936         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6937              && data.last_start_min == 0 && data.last_end > 0
6938              && !RExC_seen_zerolen
6939              && !(RExC_seen & REG_VERBARG_SEEN)
6940              && !(RExC_seen & REG_GPOS_SEEN)
6941         ){
6942             r->extflags |= RXf_CHECK_ALL;
6943         }
6944         scan_commit(pRExC_state, &data,&minlen,0);
6945
6946         longest_float_length = CHR_SVLEN(data.longest_float);
6947
6948         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6949                    && data.offset_fixed == data.offset_float_min
6950                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6951             && S_setup_longest (aTHX_ pRExC_state,
6952                                     data.longest_float,
6953                                     &(r->float_utf8),
6954                                     &(r->float_substr),
6955                                     &(r->float_end_shift),
6956                                     data.lookbehind_float,
6957                                     data.offset_float_min,
6958                                     data.minlen_float,
6959                                     longest_float_length,
6960                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6961                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6962         {
6963             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6964             r->float_max_offset = data.offset_float_max;
6965             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6966                 r->float_max_offset -= data.lookbehind_float;
6967             SvREFCNT_inc_simple_void_NN(data.longest_float);
6968         }
6969         else {
6970             r->float_substr = r->float_utf8 = NULL;
6971             longest_float_length = 0;
6972         }
6973
6974         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6975
6976         if (S_setup_longest (aTHX_ pRExC_state,
6977                                 data.longest_fixed,
6978                                 &(r->anchored_utf8),
6979                                 &(r->anchored_substr),
6980                                 &(r->anchored_end_shift),
6981                                 data.lookbehind_fixed,
6982                                 data.offset_fixed,
6983                                 data.minlen_fixed,
6984                                 longest_fixed_length,
6985                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6986                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6987         {
6988             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6989             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6990         }
6991         else {
6992             r->anchored_substr = r->anchored_utf8 = NULL;
6993             longest_fixed_length = 0;
6994         }
6995         LEAVE_with_name("study_chunk");
6996
6997         if (ri->regstclass
6998             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6999             ri->regstclass = NULL;
7000
7001         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7002             && stclass_flag
7003             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7004             && !ssc_is_anything(data.start_class))
7005         {
7006             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7007
7008             ssc_finalize(pRExC_state, data.start_class);
7009
7010             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7011             StructCopy(data.start_class,
7012                        (regnode_ssc*)RExC_rxi->data->data[n],
7013                        regnode_ssc);
7014             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7015             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7016             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7017                       regprop(r, sv, (regnode*)data.start_class, NULL);
7018                       PerlIO_printf(Perl_debug_log,
7019                                     "synthetic stclass \"%s\".\n",
7020                                     SvPVX_const(sv));});
7021             data.start_class = NULL;
7022         }
7023
7024         /* A temporary algorithm prefers floated substr to fixed one to dig
7025          * more info. */
7026         if (longest_fixed_length > longest_float_length) {
7027             r->substrs->check_ix = 0;
7028             r->check_end_shift = r->anchored_end_shift;
7029             r->check_substr = r->anchored_substr;
7030             r->check_utf8 = r->anchored_utf8;
7031             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7032             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7033                 r->intflags |= PREGf_NOSCAN;
7034         }
7035         else {
7036             r->substrs->check_ix = 1;
7037             r->check_end_shift = r->float_end_shift;
7038             r->check_substr = r->float_substr;
7039             r->check_utf8 = r->float_utf8;
7040             r->check_offset_min = r->float_min_offset;
7041             r->check_offset_max = r->float_max_offset;
7042         }
7043         if ((r->check_substr || r->check_utf8) ) {
7044             r->extflags |= RXf_USE_INTUIT;
7045             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7046                 r->extflags |= RXf_INTUIT_TAIL;
7047         }
7048         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7049
7050         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7051         if ( (STRLEN)minlen < longest_float_length )
7052             minlen= longest_float_length;
7053         if ( (STRLEN)minlen < longest_fixed_length )
7054             minlen= longest_fixed_length;
7055         */
7056     }
7057     else {
7058         /* Several toplevels. Best we can is to set minlen. */
7059         SSize_t fake;
7060         regnode_ssc ch_class;
7061         SSize_t last_close = 0;
7062
7063         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7064
7065         scan = ri->program + 1;
7066         ssc_init(pRExC_state, &ch_class);
7067         data.start_class = &ch_class;
7068         data.last_closep = &last_close;
7069
7070         DEBUG_RExC_seen();
7071         minlen = study_chunk(pRExC_state,
7072             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7073             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7074                                                       ? SCF_TRIE_DOING_RESTUDY
7075                                                       : 0),
7076             0);
7077
7078         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7079
7080         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7081                 = r->float_substr = r->float_utf8 = NULL;
7082
7083         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7084             && ! ssc_is_anything(data.start_class))
7085         {
7086             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7087
7088             ssc_finalize(pRExC_state, data.start_class);
7089
7090             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7091             StructCopy(data.start_class,
7092                        (regnode_ssc*)RExC_rxi->data->data[n],
7093                        regnode_ssc);
7094             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7095             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7096             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7097                       regprop(r, sv, (regnode*)data.start_class, NULL);
7098                       PerlIO_printf(Perl_debug_log,
7099                                     "synthetic stclass \"%s\".\n",
7100                                     SvPVX_const(sv));});
7101             data.start_class = NULL;
7102         }
7103     }
7104
7105     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7106         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7107         r->maxlen = REG_INFTY;
7108     }
7109     else {
7110         r->maxlen = RExC_maxlen;
7111     }
7112
7113     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7114        the "real" pattern. */
7115     DEBUG_OPTIMISE_r({
7116         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7117                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7118     });
7119     r->minlenret = minlen;
7120     if (r->minlen < minlen)
7121         r->minlen = minlen;
7122
7123     if (RExC_seen & REG_GPOS_SEEN)
7124         r->intflags |= PREGf_GPOS_SEEN;
7125     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7126         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7127                                                 lookbehind */
7128     if (pRExC_state->num_code_blocks)
7129         r->extflags |= RXf_EVAL_SEEN;
7130     if (RExC_seen & REG_CANY_SEEN)
7131         r->intflags |= PREGf_CANY_SEEN;
7132     if (RExC_seen & REG_VERBARG_SEEN)
7133     {
7134         r->intflags |= PREGf_VERBARG_SEEN;
7135         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7136     }
7137     if (RExC_seen & REG_CUTGROUP_SEEN)
7138         r->intflags |= PREGf_CUTGROUP_SEEN;
7139     if (pm_flags & PMf_USE_RE_EVAL)
7140         r->intflags |= PREGf_USE_RE_EVAL;
7141     if (RExC_paren_names)
7142         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7143     else
7144         RXp_PAREN_NAMES(r) = NULL;
7145
7146     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7147      * so it can be used in pp.c */
7148     if (r->intflags & PREGf_ANCH)
7149         r->extflags |= RXf_IS_ANCHORED;
7150
7151
7152     {
7153         /* this is used to identify "special" patterns that might result
7154          * in Perl NOT calling the regex engine and instead doing the match "itself",
7155          * particularly special cases in split//. By having the regex compiler
7156          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7157          * we avoid weird issues with equivalent patterns resulting in different behavior,
7158          * AND we allow non Perl engines to get the same optimizations by the setting the
7159          * flags appropriately - Yves */
7160         regnode *first = ri->program + 1;
7161         U8 fop = OP(first);
7162         regnode *next = NEXTOPER(first);
7163         U8 nop = OP(next);
7164
7165         if (PL_regkind[fop] == NOTHING && nop == END)
7166             r->extflags |= RXf_NULL;
7167         else if (PL_regkind[fop] == BOL && nop == END)
7168             r->extflags |= RXf_START_ONLY;
7169         else if (fop == PLUS
7170                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7171                  && OP(regnext(first)) == END)
7172             r->extflags |= RXf_WHITE;
7173         else if ( r->extflags & RXf_SPLIT
7174                   && fop == EXACT
7175                   && STR_LEN(first) == 1
7176                   && *(STRING(first)) == ' '
7177                   && OP(regnext(first)) == END )
7178             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7179
7180     }
7181
7182     if (RExC_contains_locale) {
7183         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7184     }
7185
7186 #ifdef DEBUGGING
7187     if (RExC_paren_names) {
7188         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7189         ri->data->data[ri->name_list_idx]
7190                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7191     } else
7192 #endif
7193         ri->name_list_idx = 0;
7194
7195     if (RExC_recurse_count) {
7196         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7197             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7198             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7199         }
7200     }
7201     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7202     /* assume we don't need to swap parens around before we match */
7203
7204     DEBUG_DUMP_r({
7205         DEBUG_RExC_seen();
7206         PerlIO_printf(Perl_debug_log,"Final program:\n");
7207         regdump(r);
7208     });
7209 #ifdef RE_TRACK_PATTERN_OFFSETS
7210     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7211         const STRLEN len = ri->u.offsets[0];
7212         STRLEN i;
7213         GET_RE_DEBUG_FLAGS_DECL;
7214         PerlIO_printf(Perl_debug_log,
7215                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7216         for (i = 1; i <= len; i++) {
7217             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7218                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7219                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7220             }
7221         PerlIO_printf(Perl_debug_log, "\n");
7222     });
7223 #endif
7224
7225 #ifdef USE_ITHREADS
7226     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7227      * by setting the regexp SV to readonly-only instead. If the
7228      * pattern's been recompiled, the USEDness should remain. */
7229     if (old_re && SvREADONLY(old_re))
7230         SvREADONLY_on(rx);
7231 #endif
7232     return rx;
7233 }
7234
7235
7236 SV*
7237 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7238                     const U32 flags)
7239 {
7240     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7241
7242     PERL_UNUSED_ARG(value);
7243
7244     if (flags & RXapif_FETCH) {
7245         return reg_named_buff_fetch(rx, key, flags);
7246     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7247         Perl_croak_no_modify();
7248         return NULL;
7249     } else if (flags & RXapif_EXISTS) {
7250         return reg_named_buff_exists(rx, key, flags)
7251             ? &PL_sv_yes
7252             : &PL_sv_no;
7253     } else if (flags & RXapif_REGNAMES) {
7254         return reg_named_buff_all(rx, flags);
7255     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7256         return reg_named_buff_scalar(rx, flags);
7257     } else {
7258         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7259         return NULL;
7260     }
7261 }
7262
7263 SV*
7264 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7265                          const U32 flags)
7266 {
7267     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7268     PERL_UNUSED_ARG(lastkey);
7269
7270     if (flags & RXapif_FIRSTKEY)
7271         return reg_named_buff_firstkey(rx, flags);
7272     else if (flags & RXapif_NEXTKEY)
7273         return reg_named_buff_nextkey(rx, flags);
7274     else {
7275         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7276                                             (int)flags);
7277         return NULL;
7278     }
7279 }
7280
7281 SV*
7282 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7283                           const U32 flags)
7284 {
7285     AV *retarray = NULL;
7286     SV *ret;
7287     struct regexp *const rx = ReANY(r);
7288
7289     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7290
7291     if (flags & RXapif_ALL)
7292         retarray=newAV();
7293
7294     if (rx && RXp_PAREN_NAMES(rx)) {
7295         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7296         if (he_str) {
7297             IV i;
7298             SV* sv_dat=HeVAL(he_str);
7299             I32 *nums=(I32*)SvPVX(sv_dat);
7300             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7301                 if ((I32)(rx->nparens) >= nums[i]
7302                     && rx->offs[nums[i]].start != -1
7303                     && rx->offs[nums[i]].end != -1)
7304                 {
7305                     ret = newSVpvs("");
7306                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7307                     if (!retarray)
7308                         return ret;
7309                 } else {
7310                     if (retarray)
7311                         ret = newSVsv(&PL_sv_undef);
7312                 }
7313                 if (retarray)
7314                     av_push(retarray, ret);
7315             }
7316             if (retarray)
7317                 return newRV_noinc(MUTABLE_SV(retarray));
7318         }
7319     }
7320     return NULL;
7321 }
7322
7323 bool
7324 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7325                            const U32 flags)
7326 {
7327     struct regexp *const rx = ReANY(r);
7328
7329     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7330
7331     if (rx && RXp_PAREN_NAMES(rx)) {
7332         if (flags & RXapif_ALL) {
7333             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7334         } else {
7335             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7336             if (sv) {
7337                 SvREFCNT_dec_NN(sv);
7338                 return TRUE;
7339             } else {
7340                 return FALSE;
7341             }
7342         }
7343     } else {
7344         return FALSE;
7345     }
7346 }
7347
7348 SV*
7349 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7350 {
7351     struct regexp *const rx = ReANY(r);
7352
7353     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7354
7355     if ( rx && RXp_PAREN_NAMES(rx) ) {
7356         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7357
7358         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7359     } else {
7360         return FALSE;
7361     }
7362 }
7363
7364 SV*
7365 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7366 {
7367     struct regexp *const rx = ReANY(r);
7368     GET_RE_DEBUG_FLAGS_DECL;
7369
7370     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7371
7372     if (rx && RXp_PAREN_NAMES(rx)) {
7373         HV *hv = RXp_PAREN_NAMES(rx);
7374         HE *temphe;
7375         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7376             IV i;
7377             IV parno = 0;
7378             SV* sv_dat = HeVAL(temphe);
7379             I32 *nums = (I32*)SvPVX(sv_dat);
7380             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7381                 if ((I32)(rx->lastparen) >= nums[i] &&
7382                     rx->offs[nums[i]].start != -1 &&
7383                     rx->offs[nums[i]].end != -1)
7384                 {
7385                     parno = nums[i];
7386                     break;
7387                 }
7388             }
7389             if (parno || flags & RXapif_ALL) {
7390                 return newSVhek(HeKEY_hek(temphe));
7391             }
7392         }
7393     }
7394     return NULL;
7395 }
7396
7397 SV*
7398 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7399 {
7400     SV *ret;
7401     AV *av;
7402     SSize_t length;
7403     struct regexp *const rx = ReANY(r);
7404
7405     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7406
7407     if (rx && RXp_PAREN_NAMES(rx)) {
7408         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7409             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7410         } else if (flags & RXapif_ONE) {
7411             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7412             av = MUTABLE_AV(SvRV(ret));
7413             length = av_tindex(av);
7414             SvREFCNT_dec_NN(ret);
7415             return newSViv(length + 1);
7416         } else {
7417             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7418                                                 (int)flags);
7419             return NULL;
7420         }
7421     }
7422     return &PL_sv_undef;
7423 }
7424
7425 SV*
7426 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7427 {
7428     struct regexp *const rx = ReANY(r);
7429     AV *av = newAV();
7430
7431     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7432
7433     if (rx && RXp_PAREN_NAMES(rx)) {
7434         HV *hv= RXp_PAREN_NAMES(rx);
7435         HE *temphe;
7436         (void)hv_iterinit(hv);
7437         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7438             IV i;
7439             IV parno = 0;
7440             SV* sv_dat = HeVAL(temphe);
7441             I32 *nums = (I32*)SvPVX(sv_dat);
7442             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7443                 if ((I32)(rx->lastparen) >= nums[i] &&
7444                     rx->offs[nums[i]].start != -1 &&
7445                     rx->offs[nums[i]].end != -1)
7446                 {
7447                     parno = nums[i];
7448                     break;
7449                 }
7450             }
7451             if (parno || flags & RXapif_ALL) {
7452                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7453             }
7454         }
7455     }
7456
7457     return newRV_noinc(MUTABLE_SV(av));
7458 }
7459
7460 void
7461 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7462                              SV * const sv)
7463 {
7464     struct regexp *const rx = ReANY(r);
7465     char *s = NULL;
7466     SSize_t i = 0;
7467     SSize_t s1, t1;
7468     I32 n = paren;
7469
7470     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7471
7472     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7473            || n == RX_BUFF_IDX_CARET_FULLMATCH
7474            || n == RX_BUFF_IDX_CARET_POSTMATCH
7475        )
7476     {
7477         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7478         if (!keepcopy) {
7479             /* on something like
7480              *    $r = qr/.../;
7481              *    /$qr/p;
7482              * the KEEPCOPY is set on the PMOP rather than the regex */
7483             if (PL_curpm && r == PM_GETRE(PL_curpm))
7484                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7485         }
7486         if (!keepcopy)
7487             goto ret_undef;
7488     }
7489
7490     if (!rx->subbeg)
7491         goto ret_undef;
7492
7493     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7494         /* no need to distinguish between them any more */
7495         n = RX_BUFF_IDX_FULLMATCH;
7496
7497     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7498         && rx->offs[0].start != -1)
7499     {
7500         /* $`, ${^PREMATCH} */
7501         i = rx->offs[0].start;
7502         s = rx->subbeg;
7503     }
7504     else
7505     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7506         && rx->offs[0].end != -1)
7507     {
7508         /* $', ${^POSTMATCH} */
7509         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7510         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7511     }
7512     else
7513     if ( 0 <= n && n <= (I32)rx->nparens &&
7514         (s1 = rx->offs[n].start) != -1 &&
7515         (t1 = rx->offs[n].end) != -1)
7516     {
7517         /* $&, ${^MATCH},  $1 ... */
7518         i = t1 - s1;
7519         s = rx->subbeg + s1 - rx->suboffset;
7520     } else {
7521         goto ret_undef;
7522     }
7523
7524     assert(s >= rx->subbeg);
7525     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7526     if (i >= 0) {
7527 #ifdef NO_TAINT_SUPPORT
7528         sv_setpvn(sv, s, i);
7529 #else
7530         const int oldtainted = TAINT_get;
7531         TAINT_NOT;
7532         sv_setpvn(sv, s, i);
7533         TAINT_set(oldtainted);
7534 #endif
7535         if ( (rx->intflags & PREGf_CANY_SEEN)
7536             ? (RXp_MATCH_UTF8(rx)
7537                         && (!i || is_utf8_string((U8*)s, i)))
7538             : (RXp_MATCH_UTF8(rx)) )
7539         {
7540             SvUTF8_on(sv);
7541         }
7542         else
7543             SvUTF8_off(sv);
7544         if (TAINTING_get) {
7545             if (RXp_MATCH_TAINTED(rx)) {
7546                 if (SvTYPE(sv) >= SVt_PVMG) {
7547                     MAGIC* const mg = SvMAGIC(sv);
7548                     MAGIC* mgt;
7549                     TAINT;
7550                     SvMAGIC_set(sv, mg->mg_moremagic);
7551                     SvTAINT(sv);
7552                     if ((mgt = SvMAGIC(sv))) {
7553                         mg->mg_moremagic = mgt;
7554                         SvMAGIC_set(sv, mg);
7555                     }
7556                 } else {
7557                     TAINT;
7558                     SvTAINT(sv);
7559                 }
7560             } else
7561                 SvTAINTED_off(sv);
7562         }
7563     } else {
7564       ret_undef:
7565         sv_setsv(sv,&PL_sv_undef);
7566         return;
7567     }
7568 }
7569
7570 void
7571 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7572                                                          SV const * const value)
7573 {
7574     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7575
7576     PERL_UNUSED_ARG(rx);
7577     PERL_UNUSED_ARG(paren);
7578     PERL_UNUSED_ARG(value);
7579
7580     if (!PL_localizing)
7581         Perl_croak_no_modify();
7582 }
7583
7584 I32
7585 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7586                               const I32 paren)
7587 {
7588     struct regexp *const rx = ReANY(r);
7589     I32 i;
7590     I32 s1, t1;
7591
7592     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7593
7594     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7595         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7596         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7597     )
7598     {
7599         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7600         if (!keepcopy) {
7601             /* on something like
7602              *    $r = qr/.../;
7603              *    /$qr/p;
7604              * the KEEPCOPY is set on the PMOP rather than the regex */
7605             if (PL_curpm && r == PM_GETRE(PL_curpm))
7606                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7607         }
7608         if (!keepcopy)
7609             goto warn_undef;
7610     }
7611
7612     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7613     switch (paren) {
7614       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7615       case RX_BUFF_IDX_PREMATCH:       /* $` */
7616         if (rx->offs[0].start != -1) {
7617                         i = rx->offs[0].start;
7618                         if (i > 0) {
7619                                 s1 = 0;
7620                                 t1 = i;
7621                                 goto getlen;
7622                         }
7623             }
7624         return 0;
7625
7626       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7627       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7628             if (rx->offs[0].end != -1) {
7629                         i = rx->sublen - rx->offs[0].end;
7630                         if (i > 0) {
7631                                 s1 = rx->offs[0].end;
7632                                 t1 = rx->sublen;
7633                                 goto getlen;
7634                         }
7635             }
7636         return 0;
7637
7638       default: /* $& / ${^MATCH}, $1, $2, ... */
7639             if (paren <= (I32)rx->nparens &&
7640             (s1 = rx->offs[paren].start) != -1 &&
7641             (t1 = rx->offs[paren].end) != -1)
7642             {
7643             i = t1 - s1;
7644             goto getlen;
7645         } else {
7646           warn_undef:
7647             if (ckWARN(WARN_UNINITIALIZED))
7648                 report_uninit((const SV *)sv);
7649             return 0;
7650         }
7651     }
7652   getlen:
7653     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7654         const char * const s = rx->subbeg - rx->suboffset + s1;
7655         const U8 *ep;
7656         STRLEN el;
7657
7658         i = t1 - s1;
7659         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7660                         i = el;
7661     }
7662     return i;
7663 }
7664
7665 SV*
7666 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7667 {
7668     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7669         PERL_UNUSED_ARG(rx);
7670         if (0)
7671             return NULL;
7672         else
7673             return newSVpvs("Regexp");
7674 }
7675
/* Scans the name of a named buffer from the pattern.
 * If flags is REG_RSN_RETURN_NULL, returns NULL.
 * If flags is REG_RSN_RETURN_NAME, returns an SV* containing the name.
 * If flags is REG_RSN_RETURN_DATA, returns the data SV* corresponding
 * to the parsed name as looked up in the RExC_paren_names hash.
 * If there is an error, throws a vFAIL()-type exception.
 */

/* Values for the 'flags' parameter of S_reg_scan_name() */
#define REG_RSN_RETURN_NULL    0
#define REG_RSN_RETURN_NAME    1
#define REG_RSN_RETURN_DATA    2
7687
7688 STATIC SV*
7689 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7690 {
7691     char *name_start = RExC_parse;
7692
7693     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7694
7695     assert (RExC_parse <= RExC_end);
7696     if (RExC_parse == RExC_end) NOOP;
7697     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7698          /* skip IDFIRST by using do...while */
7699         if (UTF)
7700             do {
7701                 RExC_parse += UTF8SKIP(RExC_parse);
7702             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7703         else
7704             do {
7705                 RExC_parse++;
7706             } while (isWORDCHAR(*RExC_parse));
7707     } else {
7708         RExC_parse++; /* so the <- from the vFAIL is after the offending
7709                          character */
7710         vFAIL("Group name must start with a non-digit word character");
7711     }
7712     if ( flags ) {
7713         SV* sv_name
7714             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7715                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7716         if ( flags == REG_RSN_RETURN_NAME)
7717             return sv_name;
7718         else if (flags==REG_RSN_RETURN_DATA) {
7719             HE *he_str = NULL;
7720             SV *sv_dat = NULL;
7721             if ( ! sv_name )      /* should not happen*/
7722                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7723             if (RExC_paren_names)
7724                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7725             if ( he_str )
7726                 sv_dat = HeVAL(he_str);
7727             if ( ! sv_dat )
7728                 vFAIL("Reference to nonexistent named group");
7729             return sv_dat;
7730         }
7731         else {
7732             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7733                        (unsigned long) flags);
7734         }
7735         assert(0); /* NOT REACHED */
7736     }
7737     return NULL;
7738 }
7739
/* Parse-tracing helper; its whole body is wrapped in DEBUG_PARSE_r().
 * Prints, without a trailing newline, a three-part prefix to Perl_debug_log:
 *   - up to 10 bytes of the pattern at RExC_parse, followed by "..." if
 *     more than 10 remain or "<" otherwise; blank padding instead if
 *     RExC_parse has not moved since the previous call
 *   - the node number (RExC_size + 1 under SIZE_ONLY, else the number of
 *     RExC_emit); blanks if it is the same as on the previous call
 *   - 'funcname', indented by two columns per level of 'depth'
 * It then records the node number and parse position in RExC_lastnum and
 * RExC_lastparse.  A variable named 'depth' must be in scope wherever this
 * is expanded. */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
    int rem=(int)(RExC_end - RExC_parse);                       \
    int cut;                                                    \
    int num;                                                    \
    int iscut=0;                                                \
    if (rem>10) {                                               \
        rem=10;                                                 \
        iscut=1;                                                \
    }                                                           \
    cut=10-rem;                                                 \
    if (RExC_lastparse!=RExC_parse)                             \
        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
            rem, RExC_parse,                                    \
            cut + 4,                                            \
            iscut ? "..." : "<"                                 \
        );                                                      \
    else                                                        \
        PerlIO_printf(Perl_debug_log,"%16s","");                \
                                                                \
    if (SIZE_ONLY)                                              \
       num = RExC_size + 1;                                     \
    else                                                        \
       num=REG_NODE_NUM(RExC_emit);                             \
    if (RExC_lastnum!=num)                                      \
       PerlIO_printf(Perl_debug_log,"|%4d",num);                \
    else                                                        \
       PerlIO_printf(Perl_debug_log,"|%4s","");                 \
    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
        (int)((depth*2)), "",                                   \
        (funcname)                                              \
    );                                                          \
    RExC_lastnum=num;                                           \
    RExC_lastparse=RExC_parse;                                  \
})
7774
7775
7776
/* Emit the DEBUG_PARSE_MSG() prefix for 'funcname' and end the line */
#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
    DEBUG_PARSE_MSG((funcname));                            \
    PerlIO_printf(Perl_debug_log,"%4s","\n");               \
})
/* Emit the DEBUG_PARSE_MSG() prefix for 'funcname', then 'args' formatted
 * according to 'fmt', then a newline.  'fmt' must be a string literal, as it
 * is pasted next to "\n"; 'args' is passed as a single printf argument */
#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
    DEBUG_PARSE_MSG((funcname));                            \
    PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
})
7785
7786 /* This section of code defines the inversion list object and its methods.  The
7787  * interfaces are highly subject to change, so as much as possible is static to
 * this file.  An inversion list is here implemented as a malloc'd C array of
 * UVs, held in an SVt_INVLIST scalar.
7790  *
7791  * An inversion list for Unicode is an array of code points, sorted by ordinal
7792  * number.  The zeroth element is the first code point in the list.  The 1th
7793  * element is the first element beyond that not in the list.  In other words,
7794  * the first range is
7795  *  invlist[0]..(invlist[1]-1)
7796  * The other ranges follow.  Thus every element whose index is divisible by two
7797  * marks the beginning of a range that is in the list, and every element not
7798  * divisible by two marks the beginning of a range not in the list.  A single
7799  * element inversion list that contains the single code point N generally
7800  * consists of two elements
7801  *  invlist[0] == N
7802  *  invlist[1] == N+1
7803  * (The exception is when N is the highest representable value on the
7804  * machine, in which case the list containing just it would be a single
7805  * element, itself.  By extension, if the last range in the list extends to
7806  * infinity, then the first element of that range will be in the inversion list
7807  * at a position that is divisible by two, and is the final element in the
7808  * list.)
 * Taking the complement of (inverting) an inversion list is quite simple: if
 * the first element is 0, remove it; otherwise add a 0 element at the beginning.
7811  * This implementation reserves an element at the beginning of each inversion
7812  * list to always contain 0; there is an additional flag in the header which
7813  * indicates if the list begins at the 0, or is offset to begin at the next
7814  * element.
7815  *
7816  * More about inversion lists can be found in "Unicode Demystified"
7817  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7818  * More will be coming when functionality is added later.
7819  *
7820  * The inversion list data structure is currently implemented as an SV pointing
7821  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7822  * array of UV whose memory management is automatically handled by the existing
7823  * facilities for SV's.
7824  *
7825  * Some of the methods should always be private to the implementation, and some
7826  * should eventually be made public */
7827
7828 /* The header definitions are in F<inline_invlist.c> */
7829
7830 PERL_STATIC_INLINE UV*
7831 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7832 {
7833     /* Returns a pointer to the first element in the inversion list's array.
7834      * This is called upon initialization of an inversion list.  Where the
7835      * array begins depends on whether the list has the code point U+0000 in it
7836      * or not.  The other parameter tells it whether the code that follows this
7837      * call is about to put a 0 in the inversion list or not.  The first
7838      * element is either the element reserved for 0, if TRUE, or the element
7839      * after it, if FALSE */
7840
7841     bool* offset = get_invlist_offset_addr(invlist);
7842     UV* zero_addr = (UV *) SvPVX(invlist);
7843
7844     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7845
7846     /* Must be empty */
7847     assert(! _invlist_len(invlist));
7848
7849     *zero_addr = 0;
7850
7851     /* 1^1 = 0; 1^0 = 1 */
7852     *offset = 1 ^ will_have_0;
7853     return zero_addr + *offset;
7854 }
7855
7856 PERL_STATIC_INLINE UV*
7857 S_invlist_array(SV* const invlist)
7858 {
7859     /* Returns the pointer to the inversion list's array.  Every time the
7860      * length changes, this needs to be called in case malloc or realloc moved
7861      * it */
7862
7863     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7864
7865     /* Must not be empty.  If these fail, you probably didn't check for <len>
7866      * being non-zero before trying to get the array */
7867     assert(_invlist_len(invlist));
7868
7869     /* The very first element always contains zero, The array begins either
7870      * there, or if the inversion list is offset, at the element after it.
7871      * The offset header field determines which; it contains 0 or 1 to indicate
7872      * how much additionally to add */
7873     assert(0 == *(SvPVX(invlist)));
7874     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7875 }
7876
7877 PERL_STATIC_INLINE void
7878 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7879 {
7880     /* Sets the current number of elements stored in the inversion list.
7881      * Updates SvCUR correspondingly */
7882     PERL_UNUSED_CONTEXT;
7883     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7884
7885     assert(SvTYPE(invlist) == SVt_INVLIST);
7886
7887     SvCUR_set(invlist,
7888               (len == 0)
7889                ? 0
7890                : TO_INTERNAL_SIZE(len + offset));
7891     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7892 }
7893
7894 PERL_STATIC_INLINE IV*
7895 S_get_invlist_previous_index_addr(SV* invlist)
7896 {
7897     /* Return the address of the IV that is reserved to hold the cached index
7898      * */
7899     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7900
7901     assert(SvTYPE(invlist) == SVt_INVLIST);
7902
7903     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7904 }
7905
7906 PERL_STATIC_INLINE IV
7907 S_invlist_previous_index(SV* const invlist)
7908 {
7909     /* Returns cached index of previous search */
7910
7911     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7912
7913     return *get_invlist_previous_index_addr(invlist);
7914 }
7915
7916 PERL_STATIC_INLINE void
7917 S_invlist_set_previous_index(SV* const invlist, const IV index)
7918 {
7919     /* Caches <index> for later retrieval */
7920
7921     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7922
7923     assert(index == 0 || index < (int) _invlist_len(invlist));
7924
7925     *get_invlist_previous_index_addr(invlist) = index;
7926 }
7927
7928 PERL_STATIC_INLINE UV
7929 S_invlist_max(SV* const invlist)
7930 {
7931     /* Returns the maximum number of elements storable in the inversion list's
7932      * array, without having to realloc() */
7933
7934     PERL_ARGS_ASSERT_INVLIST_MAX;
7935
7936     assert(SvTYPE(invlist) == SVt_INVLIST);
7937
7938     /* Assumes worst case, in which the 0 element is not counted in the
7939      * inversion list, so subtracts 1 for that */
7940     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7941            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7942            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7943 }
7944
7945 #ifndef PERL_IN_XSUB_RE
7946 SV*
7947 Perl__new_invlist(pTHX_ IV initial_size)
7948 {
7949
7950     /* Return a pointer to a newly constructed inversion list, with enough
7951      * space to store 'initial_size' elements.  If that number is negative, a
7952      * system default is used instead */
7953
7954     SV* new_list;
7955
7956     if (initial_size < 0) {
7957         initial_size = 10;
7958     }
7959
7960     /* Allocate the initial space */
7961     new_list = newSV_type(SVt_INVLIST);
7962
7963     /* First 1 is in case the zero element isn't in the list; second 1 is for
7964      * trailing NUL */
7965     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7966     invlist_set_len(new_list, 0, 0);
7967
7968     /* Force iterinit() to be used to get iteration to work */
7969     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7970
7971     *get_invlist_previous_index_addr(new_list) = 0;
7972
7973     return new_list;
7974 }
7975
7976 SV*
7977 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7978 {
7979     /* Return a pointer to a newly constructed inversion list, initialized to
7980      * point to <list>, which has to be in the exact correct inversion list
7981      * form, including internal fields.  Thus this is a dangerous routine that
7982      * should not be used in the wrong hands.  The passed in 'list' contains
7983      * several header fields at the beginning that are not part of the
7984      * inversion list body proper */
7985
7986     const STRLEN length = (STRLEN) list[0];
7987     const UV version_id =          list[1];
7988     const bool offset   =    cBOOL(list[2]);
7989 #define HEADER_LENGTH 3
7990     /* If any of the above changes in any way, you must change HEADER_LENGTH
7991      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7992      *      perl -E 'say int(rand 2**31-1)'
7993      */
7994 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7995                                         data structure type, so that one being
7996                                         passed in can be validated to be an
7997                                         inversion list of the correct vintage.
7998                                        */
7999
8000     SV* invlist = newSV_type(SVt_INVLIST);
8001
8002     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8003
8004     if (version_id != INVLIST_VERSION_ID) {
8005         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8006     }
8007
8008     /* The generated array passed in includes header elements that aren't part
8009      * of the list proper, so start it just after them */
8010     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8011
8012     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8013                                shouldn't touch it */
8014
8015     *(get_invlist_offset_addr(invlist)) = offset;
8016
8017     /* The 'length' passed to us is the physical number of elements in the
8018      * inversion list.  But if there is an offset the logical number is one
8019      * less than that */
8020     invlist_set_len(invlist, length  - offset, offset);
8021
8022     invlist_set_previous_index(invlist, 0);
8023
8024     /* Initialize the iteration pointer. */
8025     invlist_iterfinish(invlist);
8026
8027     SvREADONLY_on(invlist);
8028
8029     return invlist;
8030 }
8031 #endif /* ifndef PERL_IN_XSUB_RE */
8032
8033 STATIC void
8034 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8035 {
8036     /* Grow the maximum size of an inversion list */
8037
8038     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8039
8040     assert(SvTYPE(invlist) == SVt_INVLIST);
8041
8042     /* Add one to account for the zero element at the beginning which may not
8043      * be counted by the calling parameters */
8044     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8045 }
8046
8047 PERL_STATIC_INLINE void
8048 S_invlist_trim(SV* const invlist)
8049 {
8050     PERL_ARGS_ASSERT_INVLIST_TRIM;
8051
8052     assert(SvTYPE(invlist) == SVt_INVLIST);
8053
8054     /* Change the length of the inversion list to how many entries it currently
8055      * has */
8056     SvPV_shrink_to_cur((SV *) invlist);
8057 }
8058
8059 STATIC void
8060 S__append_range_to_invlist(pTHX_ SV* const invlist,
8061                                  const UV start, const UV end)
8062 {
8063    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8064     * the end of the inversion list.  The range must be above any existing
8065     * ones. */
8066
8067     UV* array;
8068     UV max = invlist_max(invlist);
8069     UV len = _invlist_len(invlist);
8070     bool offset;
8071
8072     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8073
8074     if (len == 0) { /* Empty lists must be initialized */
8075         offset = start != 0;
8076         array = _invlist_array_init(invlist, ! offset);
8077     }
8078     else {
8079         /* Here, the existing list is non-empty. The current max entry in the
8080          * list is generally the first value not in the set, except when the
8081          * set extends to the end of permissible values, in which case it is
8082          * the first entry in that final set, and so this call is an attempt to
8083          * append out-of-order */
8084
8085         UV final_element = len - 1;
8086         array = invlist_array(invlist);
8087         if (array[final_element] > start
8088             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8089         {
8090             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8091                      array[final_element], start,
8092                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8093         }
8094
8095         /* Here, it is a legal append.  If the new range begins with the first
8096          * value not in the set, it is extending the set, so the new first
8097          * value not in the set is one greater than the newly extended range.
8098          * */
8099         offset = *get_invlist_offset_addr(invlist);
8100         if (array[final_element] == start) {
8101             if (end != UV_MAX) {
8102                 array[final_element] = end + 1;
8103             }
8104             else {
8105                 /* But if the end is the maximum representable on the machine,
8106                  * just let the range that this would extend to have no end */
8107                 invlist_set_len(invlist, len - 1, offset);
8108             }
8109             return;
8110         }
8111     }
8112
8113     /* Here the new range doesn't extend any existing set.  Add it */
8114
8115     len += 2;   /* Includes an element each for the start and end of range */
8116
8117     /* If wll overflow the existing space, extend, which may cause the array to
8118      * be moved */
8119     if (max < len) {
8120         invlist_extend(invlist, len);
8121
8122         /* Have to set len here to avoid assert failure in invlist_array() */
8123         invlist_set_len(invlist, len, offset);
8124
8125         array = invlist_array(invlist);
8126     }
8127     else {
8128         invlist_set_len(invlist, len, offset);
8129     }
8130
8131     /* The next item on the list starts the range, the one after that is
8132      * one past the new range.  */
8133     array[len - 2] = start;
8134     if (end != UV_MAX) {
8135         array[len - 1] = end + 1;
8136     }
8137     else {
8138         /* But if the end is the maximum representable on the machine, just let
8139          * the range have no end */
8140         invlist_set_len(invlist, len - 1, offset);
8141     }
8142 }
8143
8144 #ifndef PERL_IN_XSUB_RE
8145
8146 IV
8147 Perl__invlist_search(SV* const invlist, const UV cp)
8148 {
8149     /* Searches the inversion list for the entry that contains the input code
8150      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8151      * return value is the index into the list's array of the range that
8152      * contains <cp> */
8153
8154     IV low = 0;
8155     IV mid;
8156     IV high = _invlist_len(invlist);
8157     const IV highest_element = high - 1;
8158     const UV* array;
8159
8160     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8161
8162     /* If list is empty, return failure. */
8163     if (high == 0) {
8164         return -1;
8165     }
8166
8167     /* (We can't get the array unless we know the list is non-empty) */
8168     array = invlist_array(invlist);
8169
8170     mid = invlist_previous_index(invlist);
8171     assert(mid >=0 && mid <= highest_element);
8172
8173     /* <mid> contains the cache of the result of the previous call to this
8174      * function (0 the first time).  See if this call is for the same result,
8175      * or if it is for mid-1.  This is under the theory that calls to this
8176      * function will often be for related code points that are near each other.
8177      * And benchmarks show that caching gives better results.  We also test
8178      * here if the code point is within the bounds of the list.  These tests
8179      * replace others that would have had to be made anyway to make sure that
8180      * the array bounds were not exceeded, and these give us extra information
8181      * at the same time */
8182     if (cp >= array[mid]) {
8183         if (cp >= array[highest_element]) {
8184             return highest_element;
8185         }
8186
8187         /* Here, array[mid] <= cp < array[highest_element].  This means that
8188          * the final element is not the answer, so can exclude it; it also
8189          * means that <mid> is not the final element, so can refer to 'mid + 1'
8190          * safely */
8191         if (cp < array[mid + 1]) {
8192             return mid;
8193         }
8194         high--;
8195         low = mid + 1;
8196     }
8197     else { /* cp < aray[mid] */
8198         if (cp < array[0]) { /* Fail if outside the array */
8199             return -1;
8200         }
8201         high = mid;
8202         if (cp >= array[mid - 1]) {
8203             goto found_entry;
8204         }
8205     }
8206
8207     /* Binary search.  What we are looking for is <i> such that
8208      *  array[i] <= cp < array[i+1]
8209      * The loop below converges on the i+1.  Note that there may not be an
8210      * (i+1)th element in the array, and things work nonetheless */
8211     while (low < high) {
8212         mid = (low + high) / 2;
8213         assert(mid <= highest_element);
8214         if (array[mid] <= cp) { /* cp >= array[mid] */
8215             low = mid + 1;
8216
8217             /* We could do this extra test to exit the loop early.
8218             if (cp < array[low]) {
8219                 return mid;
8220             }
8221             */
8222         }
8223         else { /* cp < array[mid] */
8224             high = mid;
8225         }
8226     }
8227
8228   found_entry:
8229     high--;
8230     invlist_set_previous_index(invlist, high);
8231     return high;
8232 }
8233
8234 void
8235 Perl__invlist_populate_swatch(SV* const invlist,
8236                               const UV start, const UV end, U8* swatch)
8237 {
8238     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8239      * but is used when the swash has an inversion list.  This makes this much
8240      * faster, as it uses a binary search instead of a linear one.  This is
8241      * intimately tied to that function, and perhaps should be in utf8.c,
8242      * except it is intimately tied to inversion lists as well.  It assumes
8243      * that <swatch> is all 0's on input */
8244
8245     UV current = start;
8246     const IV len = _invlist_len(invlist);
8247     IV i;
8248     const UV * array;
8249
8250     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8251
8252     if (len == 0) { /* Empty inversion list */
8253         return;
8254     }
8255
8256     array = invlist_array(invlist);
8257
8258     /* Find which element it is */
8259     i = _invlist_search(invlist, start);
8260
8261     /* We populate from <start> to <end> */
8262     while (current < end) {
8263         UV upper;
8264
8265         /* The inversion list gives the results for every possible code point
8266          * after the first one in the list.  Only those ranges whose index is
8267          * even are ones that the inversion list matches.  For the odd ones,
8268          * and if the initial code point is not in the list, we have to skip
8269          * forward to the next element */
8270         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8271             i++;
8272             if (i >= len) { /* Finished if beyond the end of the array */
8273                 return;
8274             }
8275             current = array[i];
8276             if (current >= end) {   /* Finished if beyond the end of what we
8277                                        are populating */
8278                 if (LIKELY(end < UV_MAX)) {
8279                     return;
8280                 }
8281
8282                 /* We get here when the upper bound is the maximum
8283                  * representable on the machine, and we are looking for just
8284                  * that code point.  Have to special case it */
8285                 i = len;
8286                 goto join_end_of_list;
8287             }
8288         }
8289         assert(current >= start);
8290
8291         /* The current range ends one below the next one, except don't go past
8292          * <end> */
8293         i++;
8294         upper = (i < len && array[i] < end) ? array[i] : end;
8295
8296         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8297          * for each code point in it */
8298         for (; current < upper; current++) {
8299             const STRLEN offset = (STRLEN)(current - start);
8300             swatch[offset >> 3] |= 1 << (offset & 7);
8301         }
8302
8303     join_end_of_list:
8304
8305         /* Quit if at the end of the list */
8306         if (i >= len) {
8307
8308             /* But first, have to deal with the highest possible code point on
8309              * the platform.  The previous code assumes that <end> is one
8310              * beyond where we want to populate, but that is impossible at the
8311              * platform's infinity, so have to handle it specially */
8312             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8313             {
8314                 const STRLEN offset = (STRLEN)(end - start);
8315                 swatch[offset >> 3] |= 1 << (offset & 7);
8316             }
8317             return;
8318         }
8319
8320         /* Advance to the next range, which will be for code points not in the
8321          * inversion list */
8322         current = array[i];
8323     }
8324
8325     return;
8326 }
8327
8328 void
8329 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8330                                          const bool complement_b, SV** output)
8331 {
8332     /* Take the union of two inversion lists and point <output> to it.  *output
8333      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8334      * the reference count to that list will be decremented if not already a
8335      * temporary (mortal); otherwise *output will be made correspondingly
8336      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8337      * second list is returned.  If <complement_b> is TRUE, the union is taken
8338      * of the complement (inversion) of <b> instead of b itself.
8339      *
8340      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8341      * Richard Gillam, published by Addison-Wesley, and explained at some
8342      * length there.  The preface says to incorporate its examples into your
8343      * code at your own risk.
8344      *
8345      * The algorithm is like a merge sort.
8346      *
8347      * XXX A potential performance improvement is to keep track as we go along
8348      * if only one of the inputs contributes to the result, meaning the other
8349      * is a subset of that one.  In that case, we can skip the final copy and
8350      * return the larger of the input lists, but then outside code might need
8351      * to keep track of whether to free the input list or not */
8352
8353     const UV* array_a;    /* a's array */
8354     const UV* array_b;
8355     UV len_a;       /* length of a's array */
8356     UV len_b;
8357
8358     SV* u;                      /* the resulting union */
8359     UV* array_u;
8360     UV len_u;
8361
8362     UV i_a = 0;             /* current index into a's array */
8363     UV i_b = 0;
8364     UV i_u = 0;
8365
8366     /* running count, as explained in the algorithm source book; items are
8367      * stopped accumulating and are output when the count changes to/from 0.
8368      * The count is incremented when we start a range that's in the set, and
8369      * decremented when we start a range that's not in the set.  So its range
8370      * is 0 to 2.  Only when the count is zero is something not in the set.
8371      */
8372     UV count = 0;
8373
8374     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8375     assert(a != b);
8376
8377     /* If either one is empty, the union is the other one */
8378     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8379         bool make_temp = FALSE; /* Should we mortalize the result? */
8380
8381         if (*output == a) {
8382             if (a != NULL) {
8383                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8384                     SvREFCNT_dec_NN(a);
8385                 }
8386             }
8387         }
8388         if (*output != b) {
8389             *output = invlist_clone(b);
8390             if (complement_b) {
8391                 _invlist_invert(*output);
8392             }
8393         } /* else *output already = b; */
8394
8395         if (make_temp) {
8396             sv_2mortal(*output);
8397         }
8398         return;
8399     }
8400     else if ((len_b = _invlist_len(b)) == 0) {
8401         bool make_temp = FALSE;
8402         if (*output == b) {
8403             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8404                 SvREFCNT_dec_NN(b);
8405             }
8406         }
8407
8408         /* The complement of an empty list is a list that has everything in it,
8409          * so the union with <a> includes everything too */
8410         if (complement_b) {
8411             if (a == *output) {
8412                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8413                     SvREFCNT_dec_NN(a);
8414                 }
8415             }
8416             *output = _new_invlist(1);
8417             _append_range_to_invlist(*output, 0, UV_MAX);
8418         }
8419         else if (*output != a) {
8420             *output = invlist_clone(a);
8421         }
8422         /* else *output already = a; */
8423
8424         if (make_temp) {
8425             sv_2mortal(*output);
8426         }
8427         return;
8428     }
8429
8430     /* Here both lists exist and are non-empty */
8431     array_a = invlist_array(a);
8432     array_b = invlist_array(b);
8433
8434     /* If are to take the union of 'a' with the complement of b, set it
8435      * up so are looking at b's complement. */
8436     if (complement_b) {
8437
8438         /* To complement, we invert: if the first element is 0, remove it.  To
8439          * do this, we just pretend the array starts one later */
8440         if (array_b[0] == 0) {
8441             array_b++;
8442             len_b--;
8443         }
8444         else {
8445
8446             /* But if the first element is not zero, we pretend the list starts
8447              * at the 0 that is always stored immediately before the array. */
8448             array_b--;
8449             len_b++;
8450         }
8451     }
8452
8453     /* Size the union for the worst case: that the sets are completely
8454      * disjoint */
8455     u = _new_invlist(len_a + len_b);
8456
8457     /* Will contain U+0000 if either component does */
8458     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8459                                       || (len_b > 0 && array_b[0] == 0));
8460
8461     /* Go through each list item by item, stopping when exhausted one of
8462      * them */
8463     while (i_a < len_a && i_b < len_b) {
8464         UV cp;      /* The element to potentially add to the union's array */
8465         bool cp_in_set;   /* is it in the the input list's set or not */
8466
8467         /* We need to take one or the other of the two inputs for the union.
8468          * Since we are merging two sorted lists, we take the smaller of the
8469          * next items.  In case of a tie, we take the one that is in its set
8470          * first.  If we took one not in the set first, it would decrement the
8471          * count, possibly to 0 which would cause it to be output as ending the
8472          * range, and the next time through we would take the same number, and
8473          * output it again as beginning the next range.  By doing it the
8474          * opposite way, there is no possibility that the count will be
8475          * momentarily decremented to 0, and thus the two adjoining ranges will
8476          * be seamlessly merged.  (In a tie and both are in the set or both not
8477          * in the set, it doesn't matter which we take first.) */
8478         if (array_a[i_a] < array_b[i_b]
8479             || (array_a[i_a] == array_b[i_b]
8480                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8481         {
8482             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8483             cp= array_a[i_a++];
8484         }
8485         else {
8486             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8487             cp = array_b[i_b++];
8488         }
8489
8490         /* Here, have chosen which of the two inputs to look at.  Only output
8491          * if the running count changes to/from 0, which marks the
8492          * beginning/end of a range in that's in the set */
8493         if (cp_in_set) {
8494             if (count == 0) {
8495                 array_u[i_u++] = cp;
8496             }
8497             count++;
8498         }
8499         else {
8500             count--;
8501             if (count == 0) {
8502                 array_u[i_u++] = cp;
8503             }
8504         }
8505     }
8506
8507     /* Here, we are finished going through at least one of the lists, which
8508      * means there is something remaining in at most one.  We check if the list
8509      * that hasn't been exhausted is positioned such that we are in the middle
8510      * of a range in its set or not.  (i_a and i_b point to the element beyond
8511      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8512      * is potentially more to output.
8513      * There are four cases:
8514      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8515      *     in the union is entirely from the non-exhausted set.
8516      *  2) Both were in their sets, count is 2.  Nothing further should
8517      *     be output, as everything that remains will be in the exhausted
8518      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8519      *     that
8520      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8521      *     Nothing further should be output because the union includes
8522      *     everything from the exhausted set.  Not decrementing ensures that.
8523      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8524      *     decrementing to 0 insures that we look at the remainder of the
8525      *     non-exhausted set */
8526     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8527         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8528     {
8529         count--;
8530     }
8531
8532     /* The final length is what we've output so far, plus what else is about to
8533      * be output.  (If 'count' is non-zero, then the input list we exhausted
8534      * has everything remaining up to the machine's limit in its set, and hence
8535      * in the union, so there will be no further output. */
8536     len_u = i_u;
8537     if (count == 0) {
8538         /* At most one of the subexpressions will be non-zero */
8539         len_u += (len_a - i_a) + (len_b - i_b);
8540     }
8541
8542     /* Set result to final length, which can change the pointer to array_u, so
8543      * re-find it */
8544     if (len_u != _invlist_len(u)) {
8545         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8546         invlist_trim(u);
8547         array_u = invlist_array(u);
8548     }
8549
8550     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8551      * the other) ended with everything above it not in its set.  That means
8552      * that the remaining part of the union is precisely the same as the
8553      * non-exhausted list, so can just copy it unchanged.  (If both list were
8554      * exhausted at the same time, then the operations below will be both 0.)
8555      */
8556     if (count == 0) {
8557         IV copy_count; /* At most one will have a non-zero copy count */
8558         if ((copy_count = len_a - i_a) > 0) {
8559             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8560         }
8561         else if ((copy_count = len_b - i_b) > 0) {
8562             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8563         }
8564     }
8565
8566     /*  We may be removing a reference to one of the inputs.  If so, the output
8567      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8568      *  count decremented) */
8569     if (a == *output || b == *output) {
8570         assert(! invlist_is_iterating(*output));
8571         if ((SvTEMP(*output))) {
8572             sv_2mortal(u);
8573         }
8574         else {
8575             SvREFCNT_dec_NN(*output);
8576         }
8577     }
8578
8579     *output = u;
8580
8581     return;
8582 }
8583
8584 void
8585 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8586                                                const bool complement_b, SV** i)
8587 {
8588     /* Take the intersection of two inversion lists and point <i> to it.  *i
8589      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8590      * the reference count to that list will be decremented if not already a
8591      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8592      * The first list, <a>, may be NULL, in which case an empty list is
8593      * returned.  If <complement_b> is TRUE, the result will be the
8594      * intersection of <a> and the complement (or inversion) of <b> instead of
8595      * <b> directly.
8596      *
8597      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8598      * Richard Gillam, published by Addison-Wesley, and explained at some
8599      * length there.  The preface says to incorporate its examples into your
8600      * code at your own risk.  In fact, it had bugs
8601      *
8602      * The algorithm is like a merge sort, and is essentially the same as the
8603      * union above
8604      */
8605
8606     const UV* array_a;          /* a's array */
8607     const UV* array_b;
8608     UV len_a;   /* length of a's array */
8609     UV len_b;
8610
8611     SV* r;                   /* the resulting intersection */
8612     UV* array_r;
8613     UV len_r;
8614
8615     UV i_a = 0;             /* current index into a's array */
8616     UV i_b = 0;
8617     UV i_r = 0;
8618
8619     /* running count, as explained in the algorithm source book; items are
8620      * stopped accumulating and are output when the count changes to/from 2.
8621      * The count is incremented when we start a range that's in the set, and
8622      * decremented when we start a range that's not in the set.  So its range
8623      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8624      */
8625     UV count = 0;
8626
8627     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8628     assert(a != b);
8629
8630     /* Special case if either one is empty */
8631     len_a = (a == NULL) ? 0 : _invlist_len(a);
8632     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8633         bool make_temp = FALSE;
8634
8635         if (len_a != 0 && complement_b) {
8636
8637             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8638              * be empty.  Here, also we are using 'b's complement, which hence
8639              * must be every possible code point.  Thus the intersection is
8640              * simply 'a'. */
8641             if (*i != a) {
8642                 if (*i == b) {
8643                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8644                         SvREFCNT_dec_NN(b);
8645                     }
8646                 }
8647
8648                 *i = invlist_clone(a);
8649             }
8650             /* else *i is already 'a' */
8651
8652             if (make_temp) {
8653                 sv_2mortal(*i);
8654             }
8655             return;
8656         }
8657
8658         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8659          * intersection must be empty */
8660         if (*i == a) {
8661             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8662                 SvREFCNT_dec_NN(a);
8663             }
8664         }
8665         else if (*i == b) {
8666             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8667                 SvREFCNT_dec_NN(b);
8668             }
8669         }
8670         *i = _new_invlist(0);
8671         if (make_temp) {
8672             sv_2mortal(*i);
8673         }
8674
8675         return;
8676     }
8677
8678     /* Here both lists exist and are non-empty */
8679     array_a = invlist_array(a);
8680     array_b = invlist_array(b);
8681
8682     /* If are to take the intersection of 'a' with the complement of b, set it
8683      * up so are looking at b's complement. */
8684     if (complement_b) {
8685
8686         /* To complement, we invert: if the first element is 0, remove it.  To
8687          * do this, we just pretend the array starts one later */
8688         if (array_b[0] == 0) {
8689             array_b++;
8690             len_b--;
8691         }
8692         else {
8693
8694             /* But if the first element is not zero, we pretend the list starts
8695              * at the 0 that is always stored immediately before the array. */
8696             array_b--;
8697             len_b++;
8698         }
8699     }
8700
8701     /* Size the intersection for the worst case: that the intersection ends up
8702      * fragmenting everything to be completely disjoint */
8703     r= _new_invlist(len_a + len_b);
8704
8705     /* Will contain U+0000 iff both components do */
8706     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8707                                      && len_b > 0 && array_b[0] == 0);
8708
8709     /* Go through each list item by item, stopping when exhausted one of
8710      * them */
8711     while (i_a < len_a && i_b < len_b) {
8712         UV cp;      /* The element to potentially add to the intersection's
8713                        array */
8714         bool cp_in_set; /* Is it in the input list's set or not */
8715
8716         /* We need to take one or the other of the two inputs for the
8717          * intersection.  Since we are merging two sorted lists, we take the
8718          * smaller of the next items.  In case of a tie, we take the one that
8719          * is not in its set first (a difference from the union algorithm).  If
8720          * we took one in the set first, it would increment the count, possibly
8721          * to 2 which would cause it to be output as starting a range in the
8722          * intersection, and the next time through we would take that same
8723          * number, and output it again as ending the set.  By doing it the
8724          * opposite of this, there is no possibility that the count will be
8725          * momentarily incremented to 2.  (In a tie and both are in the set or
8726          * both not in the set, it doesn't matter which we take first.) */
8727         if (array_a[i_a] < array_b[i_b]
8728             || (array_a[i_a] == array_b[i_b]
8729                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8730         {
8731             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8732             cp= array_a[i_a++];
8733         }
8734         else {
8735             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8736             cp= array_b[i_b++];
8737         }
8738
8739         /* Here, have chosen which of the two inputs to look at.  Only output
8740          * if the running count changes to/from 2, which marks the
8741          * beginning/end of a range that's in the intersection */
8742         if (cp_in_set) {
8743             count++;
8744             if (count == 2) {
8745                 array_r[i_r++] = cp;
8746             }
8747         }
8748         else {
8749             if (count == 2) {
8750                 array_r[i_r++] = cp;
8751             }
8752             count--;
8753         }
8754     }
8755
8756     /* Here, we are finished going through at least one of the lists, which
8757      * means there is something remaining in at most one.  We check if the list
8758      * that has been exhausted is positioned such that we are in the middle
8759      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8760      * the ones we care about.)  There are four cases:
8761      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8762      *     nothing left in the intersection.
8763      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8764      *     above 2.  What should be output is exactly that which is in the
8765      *     non-exhausted set, as everything it has is also in the intersection
8766      *     set, and everything it doesn't have can't be in the intersection
8767      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8768      *     gets incremented to 2.  Like the previous case, the intersection is
8769      *     everything that remains in the non-exhausted set.
8770      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8771      *     remains 1.  And the intersection has nothing more. */
8772     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8773         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8774     {
8775         count++;
8776     }
8777
8778     /* The final length is what we've output so far plus what else is in the
8779      * intersection.  At most one of the subexpressions below will be non-zero
8780      * */
8781     len_r = i_r;
8782     if (count >= 2) {
8783         len_r += (len_a - i_a) + (len_b - i_b);
8784     }
8785
8786     /* Set result to final length, which can change the pointer to array_r, so
8787      * re-find it */
8788     if (len_r != _invlist_len(r)) {
8789         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8790         invlist_trim(r);
8791         array_r = invlist_array(r);
8792     }
8793
8794     /* Finish outputting any remaining */
8795     if (count >= 2) { /* At most one will have a non-zero copy count */
8796         IV copy_count;
8797         if ((copy_count = len_a - i_a) > 0) {
8798             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8799         }
8800         else if ((copy_count = len_b - i_b) > 0) {
8801             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8802         }
8803     }
8804
8805     /*  We may be removing a reference to one of the inputs.  If so, the output
8806      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8807      *  count decremented) */
8808     if (a == *i || b == *i) {
8809         assert(! invlist_is_iterating(*i));
8810         if (SvTEMP(*i)) {
8811             sv_2mortal(r);
8812         }
8813         else {
8814             SvREFCNT_dec_NN(*i);
8815         }
8816     }
8817
8818     *i = r;
8819
8820     return;
8821 }
8822
8823 SV*
8824 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8825 {
8826     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8827      * set.  A pointer to the inversion list is returned.  This may actually be
8828      * a new list, in which case the passed in one has been destroyed.  The
8829      * passed in inversion list can be NULL, in which case a new one is created
8830      * with just the one range in it */
8831
8832     SV* range_invlist;
8833     UV len;
8834
8835     if (invlist == NULL) {
8836         invlist = _new_invlist(2);
8837         len = 0;
8838     }
8839     else {
8840         len = _invlist_len(invlist);
8841     }
8842
8843     /* If comes after the final entry actually in the list, can just append it
8844      * to the end, */
8845     if (len == 0
8846         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8847             && start >= invlist_array(invlist)[len - 1]))
8848     {
8849         _append_range_to_invlist(invlist, start, end);
8850         return invlist;
8851     }
8852
8853     /* Here, can't just append things, create and return a new inversion list
8854      * which is the union of this range and the existing inversion list */
8855     range_invlist = _new_invlist(2);
8856     _append_range_to_invlist(range_invlist, start, end);
8857
8858     _invlist_union(invlist, range_invlist, &invlist);
8859
8860     /* The temporary can be freed */
8861     SvREFCNT_dec_NN(range_invlist);
8862
8863     return invlist;
8864 }
8865
8866 SV*
8867 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8868                                  UV** other_elements_ptr)
8869 {
8870     /* Create and return an inversion list whose contents are to be populated
8871      * by the caller.  The caller gives the number of elements (in 'size') and
8872      * the very first element ('element0').  This function will set
8873      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8874      * are to be placed.
8875      *
8876      * Obviously there is some trust involved that the caller will properly
8877      * fill in the other elements of the array.
8878      *
8879      * (The first element needs to be passed in, as the underlying code does
8880      * things differently depending on whether it is zero or non-zero) */
8881
8882     SV* invlist = _new_invlist(size);
8883     bool offset;
8884
8885     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8886
8887     _append_range_to_invlist(invlist, element0, element0);
8888     offset = *get_invlist_offset_addr(invlist);
8889
8890     invlist_set_len(invlist, size, offset);
8891     *other_elements_ptr = invlist_array(invlist) + 1;
8892     return invlist;
8893 }
8894
8895 #endif
8896
8897 PERL_STATIC_INLINE SV*
8898 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8899     return _add_range_to_invlist(invlist, cp, cp);
8900 }
8901
8902 #ifndef PERL_IN_XSUB_RE
8903 void
8904 Perl__invlist_invert(pTHX_ SV* const invlist)
8905 {
8906     /* Complement the input inversion list.  This adds a 0 if the list didn't
8907      * have a zero; removes it otherwise.  As described above, the data
8908      * structure is set up so that this is very efficient */
8909
8910     PERL_ARGS_ASSERT__INVLIST_INVERT;
8911
8912     assert(! invlist_is_iterating(invlist));
8913
8914     /* The inverse of matching nothing is matching everything */
8915     if (_invlist_len(invlist) == 0) {
8916         _append_range_to_invlist(invlist, 0, UV_MAX);
8917         return;
8918     }
8919
8920     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8921 }
8922
8923 #endif
8924
8925 PERL_STATIC_INLINE SV*
8926 S_invlist_clone(pTHX_ SV* const invlist)
8927 {
8928
8929     /* Return a new inversion list that is a copy of the input one, which is
8930      * unchanged.  The new list will not be mortal even if the old one was. */
8931
8932     /* Need to allocate extra space to accommodate Perl's addition of a
8933      * trailing NUL to SvPV's, since it thinks they are always strings */
8934     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8935     STRLEN physical_length = SvCUR(invlist);
8936     bool offset = *(get_invlist_offset_addr(invlist));
8937
8938     PERL_ARGS_ASSERT_INVLIST_CLONE;
8939
8940     *(get_invlist_offset_addr(new_invlist)) = offset;
8941     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8942     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8943
8944     return new_invlist;
8945 }
8946
8947 PERL_STATIC_INLINE STRLEN*
8948 S_get_invlist_iter_addr(SV* invlist)
8949 {
8950     /* Return the address of the UV that contains the current iteration
8951      * position */
8952
8953     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8954
8955     assert(SvTYPE(invlist) == SVt_INVLIST);
8956
8957     return &(((XINVLIST*) SvANY(invlist))->iterator);
8958 }
8959
8960 PERL_STATIC_INLINE void
8961 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8962 {
8963     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8964
8965     *get_invlist_iter_addr(invlist) = 0;
8966 }
8967
8968 PERL_STATIC_INLINE void
8969 S_invlist_iterfinish(SV* invlist)
8970 {
8971     /* Terminate iterator for invlist.  This is to catch development errors.
8972      * Any iteration that is interrupted before completed should call this
8973      * function.  Functions that add code points anywhere else but to the end
8974      * of an inversion list assert that they are not in the middle of an
8975      * iteration.  If they were, the addition would make the iteration
8976      * problematical: if the iteration hadn't reached the place where things
8977      * were being added, it would be ok */
8978
8979     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8980
8981     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8982 }
8983
8984 STATIC bool
8985 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8986 {
8987     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8988      * This call sets in <*start> and <*end>, the next range in <invlist>.
8989      * Returns <TRUE> if successful and the next call will return the next
8990      * range; <FALSE> if was already at the end of the list.  If the latter,
8991      * <*start> and <*end> are unchanged, and the next call to this function
8992      * will start over at the beginning of the list */
8993
8994     STRLEN* pos = get_invlist_iter_addr(invlist);
8995     UV len = _invlist_len(invlist);
8996     UV *array;
8997
8998     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8999
9000     if (*pos >= len) {
9001         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9002         return FALSE;
9003     }
9004
9005     array = invlist_array(invlist);
9006
9007     *start = array[(*pos)++];
9008
9009     if (*pos >= len) {
9010         *end = UV_MAX;
9011     }
9012     else {
9013         *end = array[(*pos)++] - 1;
9014     }
9015
9016     return TRUE;
9017 }
9018
9019 PERL_STATIC_INLINE bool
9020 S_invlist_is_iterating(SV* const invlist)
9021 {
9022     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9023
9024     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9025 }
9026
9027 PERL_STATIC_INLINE UV
9028 S_invlist_highest(SV* const invlist)
9029 {
9030     /* Returns the highest code point that matches an inversion list.  This API
9031      * has an ambiguity, as it returns 0 under either the highest is actually
9032      * 0, or if the list is empty.  If this distinction matters to you, check
9033      * for emptiness before calling this function */
9034
9035     UV len = _invlist_len(invlist);
9036     UV *array;
9037
9038     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9039
9040     if (len == 0) {
9041         return 0;
9042     }
9043
9044     array = invlist_array(invlist);
9045
9046     /* The last element in the array in the inversion list always starts a
9047      * range that goes to infinity.  That range may be for code points that are
9048      * matched in the inversion list, or it may be for ones that aren't
9049      * matched.  In the latter case, the highest code point in the set is one
9050      * less than the beginning of this range; otherwise it is the final element
9051      * of this range: infinity */
9052     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9053            ? UV_MAX
9054            : array[len - 1] - 1;
9055 }
9056
9057 #ifndef PERL_IN_XSUB_RE
9058 SV *
9059 Perl__invlist_contents(pTHX_ SV* const invlist)
9060 {
9061     /* Get the contents of an inversion list into a string SV so that they can
9062      * be printed out.  It uses the format traditionally done for debug tracing
9063      */
9064
9065     UV start, end;
9066     SV* output = newSVpvs("\n");
9067
9068     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9069
9070     assert(! invlist_is_iterating(invlist));
9071
9072     invlist_iterinit(invlist);
9073     while (invlist_iternext(invlist, &start, &end)) {
9074         if (end == UV_MAX) {
9075             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9076         }
9077         else if (end != start) {
9078             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9079                     start,       end);
9080         }
9081         else {
9082             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9083         }
9084     }
9085
9086     return output;
9087 }
9088 #endif
9089
9090 #ifndef PERL_IN_XSUB_RE
9091 void
9092 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9093                          const char * const indent, SV* const invlist)
9094 {
9095     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9096      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9097      * the string 'indent'.  The output looks like this:
9098          [0] 0x000A .. 0x000D
9099          [2] 0x0085
9100          [4] 0x2028 .. 0x2029
9101          [6] 0x3104 .. INFINITY
9102      * This means that the first range of code points matched by the list are
9103      * 0xA through 0xD; the second range contains only the single code point
9104      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9105      * are used to define each range (except if the final range extends to
9106      * infinity, only a single element is needed).  The array index of the
9107      * first element for the corresponding range is given in brackets. */
9108
9109     UV start, end;
9110     STRLEN count = 0;
9111
9112     PERL_ARGS_ASSERT__INVLIST_DUMP;
9113
9114     if (invlist_is_iterating(invlist)) {
9115         Perl_dump_indent(aTHX_ level, file,
9116              "%sCan't dump inversion list because is in middle of iterating\n",
9117              indent);
9118         return;
9119     }
9120
9121     invlist_iterinit(invlist);
9122     while (invlist_iternext(invlist, &start, &end)) {
9123         if (end == UV_MAX) {
9124             Perl_dump_indent(aTHX_ level, file,
9125                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9126                                    indent, (UV)count, start);
9127         }
9128         else if (end != start) {
9129             Perl_dump_indent(aTHX_ level, file,
9130                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9131                                 indent, (UV)count, start,         end);
9132         }
9133         else {
9134             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9135                                             indent, (UV)count, start);
9136         }
9137         count += 2;
9138     }
9139 }
9140
9141 void
9142 Perl__load_PL_utf8_foldclosures (pTHX)
9143 {
9144     assert(! PL_utf8_foldclosures);
9145
9146     /* If the folds haven't been read in, call a fold function
9147      * to force that */
9148     if (! PL_utf8_tofold) {
9149         U8 dummy[UTF8_MAXBYTES_CASE+1];
9150
9151         /* This string is just a short named one above \xff */
9152         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9153         assert(PL_utf8_tofold); /* Verify that worked */
9154     }
9155     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9156 }
9157 #endif
9158
9159 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9160 bool
9161 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9162 {
9163     /* Return a boolean as to if the two passed in inversion lists are
9164      * identical.  The final argument, if TRUE, says to take the complement of
9165      * the second inversion list before doing the comparison */
9166
9167     const UV* array_a = invlist_array(a);
9168     const UV* array_b = invlist_array(b);
9169     UV len_a = _invlist_len(a);
9170     UV len_b = _invlist_len(b);
9171
9172     UV i = 0;               /* current index into the arrays */
9173     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9174
9175     PERL_ARGS_ASSERT__INVLISTEQ;
9176
9177     /* If are to compare 'a' with the complement of b, set it
9178      * up so are looking at b's complement. */
9179     if (complement_b) {
9180
9181         /* The complement of nothing is everything, so <a> would have to have
9182          * just one element, starting at zero (ending at infinity) */
9183         if (len_b == 0) {
9184             return (len_a == 1 && array_a[0] == 0);
9185         }
9186         else if (array_b[0] == 0) {
9187
9188             /* Otherwise, to complement, we invert.  Here, the first element is
9189              * 0, just remove it.  To do this, we just pretend the array starts
9190              * one later */
9191
9192             array_b++;
9193             len_b--;
9194         }
9195         else {
9196
9197             /* But if the first element is not zero, we pretend the list starts
9198              * at the 0 that is always stored immediately before the array. */
9199             array_b--;
9200             len_b++;
9201         }
9202     }
9203
9204     /* Make sure that the lengths are the same, as well as the final element
9205      * before looping through the remainder.  (Thus we test the length, final,
9206      * and first elements right off the bat) */
9207     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9208         retval = FALSE;
9209     }
9210     else for (i = 0; i < len_a - 1; i++) {
9211         if (array_a[i] != array_b[i]) {
9212             retval = FALSE;
9213             break;
9214         }
9215     }
9216
9217     return retval;
9218 }
9219 #endif
9220
9221 #undef HEADER_LENGTH
9222 #undef TO_INTERNAL_SIZE
9223 #undef FROM_INTERNAL_SIZE
9224 #undef INVLIST_VERSION_ID
9225
9226 /* End of inversion list object */
9227
9228 STATIC void
9229 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9230 {
9231     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9232      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9233      * should point to the first flag; it is updated on output to point to the
9234      * final ')' or ':'.  There needs to be at least one flag, or this will
9235      * abort */
9236
9237     /* for (?g), (?gc), and (?o) warnings; warning
9238        about (?c) will warn about (?g) -- japhy    */
9239
9240 #define WASTED_O  0x01
9241 #define WASTED_G  0x02
9242 #define WASTED_C  0x04
9243 #define WASTED_GC (WASTED_G|WASTED_C)
9244     I32 wastedflags = 0x00;
9245     U32 posflags = 0, negflags = 0;
9246     U32 *flagsp = &posflags;
9247     char has_charset_modifier = '\0';
9248     regex_charset cs;
9249     bool has_use_defaults = FALSE;
9250     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9251
9252     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9253
9254     /* '^' as an initial flag sets certain defaults */
9255     if (UCHARAT(RExC_parse) == '^') {
9256         RExC_parse++;
9257         has_use_defaults = TRUE;
9258         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9259         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9260                                         ? REGEX_UNICODE_CHARSET
9261                                         : REGEX_DEPENDS_CHARSET);
9262     }
9263
9264     cs = get_regex_charset(RExC_flags);
9265     if (cs == REGEX_DEPENDS_CHARSET
9266         && (RExC_utf8 || RExC_uni_semantics))
9267     {
9268         cs = REGEX_UNICODE_CHARSET;
9269     }
9270
9271     while (*RExC_parse) {
9272         /* && strchr("iogcmsx", *RExC_parse) */
9273         /* (?g), (?gc) and (?o) are useless here
9274            and must be globally applied -- japhy */
9275         switch (*RExC_parse) {
9276
9277             /* Code for the imsx flags */
9278             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9279
9280             case LOCALE_PAT_MOD:
9281                 if (has_charset_modifier) {
9282                     goto excess_modifier;
9283                 }
9284                 else if (flagsp == &negflags) {
9285                     goto neg_modifier;
9286                 }
9287                 cs = REGEX_LOCALE_CHARSET;
9288                 has_charset_modifier = LOCALE_PAT_MOD;
9289                 break;
9290             case UNICODE_PAT_MOD:
9291                 if (has_charset_modifier) {
9292                     goto excess_modifier;
9293                 }
9294                 else if (flagsp == &negflags) {
9295                     goto neg_modifier;
9296                 }
9297                 cs = REGEX_UNICODE_CHARSET;
9298                 has_charset_modifier = UNICODE_PAT_MOD;
9299                 break;
9300             case ASCII_RESTRICT_PAT_MOD:
9301                 if (flagsp == &negflags) {
9302                     goto neg_modifier;
9303                 }
9304                 if (has_charset_modifier) {
9305                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9306                         goto excess_modifier;
9307                     }
9308                     /* Doubled modifier implies more restricted */
9309                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9310                 }
9311                 else {
9312                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9313                 }
9314                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9315                 break;
9316             case DEPENDS_PAT_MOD:
9317                 if (has_use_defaults) {
9318                     goto fail_modifiers;
9319                 }
9320                 else if (flagsp == &negflags) {
9321                     goto neg_modifier;
9322                 }
9323                 else if (has_charset_modifier) {
9324                     goto excess_modifier;
9325                 }
9326
9327                 /* The dual charset means unicode semantics if the
9328                  * pattern (or target, not known until runtime) are
9329                  * utf8, or something in the pattern indicates unicode
9330                  * semantics */
9331                 cs = (RExC_utf8 || RExC_uni_semantics)
9332                      ? REGEX_UNICODE_CHARSET
9333                      : REGEX_DEPENDS_CHARSET;
9334                 has_charset_modifier = DEPENDS_PAT_MOD;
9335                 break;
9336             excess_modifier:
9337                 RExC_parse++;
9338                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9339                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9340                 }
9341                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9342                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9343                                         *(RExC_parse - 1));
9344                 }
9345                 else {
9346                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9347                 }
9348                 /*NOTREACHED*/
9349             neg_modifier:
9350                 RExC_parse++;
9351                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9352                                     *(RExC_parse - 1));
9353                 /*NOTREACHED*/
9354             case ONCE_PAT_MOD: /* 'o' */
9355             case GLOBAL_PAT_MOD: /* 'g' */
9356                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9357                     const I32 wflagbit = *RExC_parse == 'o'
9358                                          ? WASTED_O
9359                                          : WASTED_G;
9360                     if (! (wastedflags & wflagbit) ) {
9361                         wastedflags |= wflagbit;
9362                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9363                         vWARN5(
9364                             RExC_parse + 1,
9365                             "Useless (%s%c) - %suse /%c modifier",
9366                             flagsp == &negflags ? "?-" : "?",
9367                             *RExC_parse,
9368                             flagsp == &negflags ? "don't " : "",
9369                             *RExC_parse
9370                         );
9371                     }
9372                 }
9373                 break;
9374
9375             case CONTINUE_PAT_MOD: /* 'c' */
9376                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9377                     if (! (wastedflags & WASTED_C) ) {
9378                         wastedflags |= WASTED_GC;
9379                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9380                         vWARN3(
9381                             RExC_parse + 1,
9382                             "Useless (%sc) - %suse /gc modifier",
9383                             flagsp == &negflags ? "?-" : "?",
9384                             flagsp == &negflags ? "don't " : ""
9385                         );
9386                     }
9387                 }
9388                 break;
9389             case KEEPCOPY_PAT_MOD: /* 'p' */
9390                 if (flagsp == &negflags) {
9391                     if (SIZE_ONLY)
9392                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9393                 } else {
9394                     *flagsp |= RXf_PMf_KEEPCOPY;
9395                 }
9396                 break;
9397             case '-':
9398                 /* A flag is a default iff it is following a minus, so
9399                  * if there is a minus, it means will be trying to
9400                  * re-specify a default which is an error */
9401                 if (has_use_defaults || flagsp == &negflags) {
9402                     goto fail_modifiers;
9403                 }
9404                 flagsp = &negflags;
9405                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9406                 break;
9407             case ':':
9408             case ')':
9409                 RExC_flags |= posflags;
9410                 RExC_flags &= ~negflags;
9411                 set_regex_charset(&RExC_flags, cs);
9412                 if (RExC_flags & RXf_PMf_FOLD) {
9413                     RExC_contains_i = 1;
9414                 }
9415                 return;
9416                 /*NOTREACHED*/
9417             default:
9418             fail_modifiers:
9419                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9420                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9421                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9422                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9423                 /*NOTREACHED*/
9424         }
9425
9426         ++RExC_parse;
9427     }
9428 }
9429
9430 /*
9431  - reg - regular expression, i.e. main body or parenthesized thing
9432  *
9433  * Caller must absorb opening parenthesis.
9434  *
9435  * Combining parenthesis handling with the base level of regular expression
9436  * is a trifle forced, but the need to tie the tails of the branches to what
9437  * follows makes it hard to avoid.
9438  */
/* Shorthands that forward their three arguments to regtail(), adding
 * 'depth+1' as a fourth.  Under DEBUGGING, the _STUDY form forwards to
 * regtail_study() instead.  Both expand to a reference to 'depth', so a
 * variable of that name must be in scope wherever they are used. */
#define REGTAIL(a,b,c) regtail((a),(b),(c),depth+1)
#ifndef DEBUGGING
#define REGTAIL_STUDY(a,b,c) regtail((a),(b),(c),depth+1)
#else
#define REGTAIL_STUDY(a,b,c) regtail_study((a),(b),(c),depth+1)
#endif
9445
9446 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9447    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9448    needs to be restarted.
9449    Otherwise would only return NULL if regbranch() returns NULL, which
9450    cannot happen.  */
9451 STATIC regnode *
9452 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9453     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9454      * 2 is like 1, but indicates that nextchar() has been called to advance
9455      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9456      * this flag alerts us to the need to check for that */
9457 {
9458     regnode *ret;               /* Will be the head of the group. */
9459     regnode *br;
9460     regnode *lastbr;
9461     regnode *ender = NULL;
9462     I32 parno = 0;
9463     I32 flags;
9464     U32 oregflags = RExC_flags;
9465     bool have_branch = 0;
9466     bool is_open = 0;
9467     I32 freeze_paren = 0;
9468     I32 after_freeze = 0;
9469     I32 num; /* numeric backreferences */
9470
9471     char * parse_start = RExC_parse; /* MJD */
9472     char * const oregcomp_parse = RExC_parse;
9473
9474     GET_RE_DEBUG_FLAGS_DECL;
9475
9476     PERL_ARGS_ASSERT_REG;
9477     DEBUG_PARSE("reg ");
9478
9479     *flagp = 0;                         /* Tentatively. */
9480
9481
9482     /* Make an OPEN node, if parenthesized. */
9483     if (paren) {
9484
9485         /* Under /x, space and comments can be gobbled up between the '(' and
9486          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9487          * intervening space, as the sequence is a token, and a token should be
9488          * indivisible */
9489         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9490
9491         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9492             char *start_verb = RExC_parse;
9493             STRLEN verb_len = 0;
9494             char *start_arg = NULL;
9495             unsigned char op = 0;
9496             int argok = 1;
9497             int internal_argval = 0; /* internal_argval is only useful if
9498                                         !argok */
9499
9500             if (has_intervening_patws) {
9501                 RExC_parse++;
9502                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9503             }
9504             while ( *RExC_parse && *RExC_parse != ')' ) {
9505                 if ( *RExC_parse == ':' ) {
9506                     start_arg = RExC_parse + 1;
9507                     break;
9508                 }
9509                 RExC_parse++;
9510             }
9511             ++start_verb;
9512             verb_len = RExC_parse - start_verb;
9513             if ( start_arg ) {
9514                 RExC_parse++;
9515                 while ( *RExC_parse && *RExC_parse != ')' )
9516                     RExC_parse++;
9517                 if ( *RExC_parse != ')' )
9518                     vFAIL("Unterminated verb pattern argument");
9519                 if ( RExC_parse == start_arg )
9520                     start_arg = NULL;
9521             } else {
9522                 if ( *RExC_parse != ')' )
9523                     vFAIL("Unterminated verb pattern");
9524             }
9525
9526             switch ( *start_verb ) {
9527             case 'A':  /* (*ACCEPT) */
9528                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9529                     op = ACCEPT;
9530                     internal_argval = RExC_nestroot;
9531                 }
9532                 break;
9533             case 'C':  /* (*COMMIT) */
9534                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9535                     op = COMMIT;
9536                 break;
9537             case 'F':  /* (*FAIL) */
9538                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9539                     op = OPFAIL;
9540                     argok = 0;
9541                 }
9542                 break;
9543             case ':':  /* (*:NAME) */
9544             case 'M':  /* (*MARK:NAME) */
9545                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9546                     op = MARKPOINT;
9547                     argok = -1;
9548                 }
9549                 break;
9550             case 'P':  /* (*PRUNE) */
9551                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9552                     op = PRUNE;
9553                 break;
9554             case 'S':   /* (*SKIP) */
9555                 if ( memEQs(start_verb,verb_len,"SKIP") )
9556                     op = SKIP;
9557                 break;
9558             case 'T':  /* (*THEN) */
9559                 /* [19:06] <TimToady> :: is then */
9560                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9561                     op = CUTGROUP;
9562                     RExC_seen |= REG_CUTGROUP_SEEN;
9563                 }
9564                 break;
9565             }
9566             if ( ! op ) {
9567                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9568                 vFAIL2utf8f(
9569                     "Unknown verb pattern '%"UTF8f"'",
9570                     UTF8fARG(UTF, verb_len, start_verb));
9571             }
9572             if ( argok ) {
9573                 if ( start_arg && internal_argval ) {
9574                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9575                         verb_len, start_verb);
9576                 } else if ( argok < 0 && !start_arg ) {
9577                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9578                         verb_len, start_verb);
9579                 } else {
9580                     ret = reganode(pRExC_state, op, internal_argval);
9581                     if ( ! internal_argval && ! SIZE_ONLY ) {
9582                         if (start_arg) {
9583                             SV *sv = newSVpvn( start_arg,
9584                                                RExC_parse - start_arg);
9585                             ARG(ret) = add_data( pRExC_state,
9586                                                  STR_WITH_LEN("S"));
9587                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9588                             ret->flags = 0;
9589                         } else {
9590                             ret->flags = 1;
9591                         }
9592                     }
9593                 }
9594                 if (!internal_argval)
9595                     RExC_seen |= REG_VERBARG_SEEN;
9596             } else if ( start_arg ) {
9597                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9598                         verb_len, start_verb);
9599             } else {
9600                 ret = reg_node(pRExC_state, op);
9601             }
9602             nextchar(pRExC_state);
9603             return ret;
9604         }
9605         else if (*RExC_parse == '?') { /* (?...) */
9606             bool is_logical = 0;
9607             const char * const seqstart = RExC_parse;
9608             if (has_intervening_patws) {
9609                 RExC_parse++;
9610                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9611             }
9612
9613             RExC_parse++;
9614             paren = *RExC_parse++;
9615             ret = NULL;                 /* For look-ahead/behind. */
9616             switch (paren) {
9617
9618             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9619                 paren = *RExC_parse++;
9620                 if ( paren == '<')         /* (?P<...>) named capture */
9621                     goto named_capture;
9622                 else if (paren == '>') {   /* (?P>name) named recursion */
9623                     goto named_recursion;
9624                 }
9625                 else if (paren == '=') {   /* (?P=...)  named backref */
9626                     /* this pretty much dupes the code for \k<NAME> in
9627                      * regatom(), if you change this make sure you change that
9628                      * */
9629                     char* name_start = RExC_parse;
9630                     U32 num = 0;
9631                     SV *sv_dat = reg_scan_name(pRExC_state,
9632                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9633                     if (RExC_parse == name_start || *RExC_parse != ')')
9634                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9635                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9636
9637                     if (!SIZE_ONLY) {
9638                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9639                         RExC_rxi->data->data[num]=(void*)sv_dat;
9640                         SvREFCNT_inc_simple_void(sv_dat);
9641                     }
9642                     RExC_sawback = 1;
9643                     ret = reganode(pRExC_state,
9644                                    ((! FOLD)
9645                                      ? NREF
9646                                      : (ASCII_FOLD_RESTRICTED)
9647                                        ? NREFFA
9648                                        : (AT_LEAST_UNI_SEMANTICS)
9649                                          ? NREFFU
9650                                          : (LOC)
9651                                            ? NREFFL
9652                                            : NREFF),
9653                                     num);
9654                     *flagp |= HASWIDTH;
9655
9656                     Set_Node_Offset(ret, parse_start+1);
9657                     Set_Node_Cur_Length(ret, parse_start);
9658
9659                     nextchar(pRExC_state);
9660                     return ret;
9661                 }
9662                 RExC_parse++;
9663                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9664                 vFAIL3("Sequence (%.*s...) not recognized",
9665                                 RExC_parse-seqstart, seqstart);
9666                 /*NOTREACHED*/
9667             case '<':           /* (?<...) */
9668                 if (*RExC_parse == '!')
9669                     paren = ',';
9670                 else if (*RExC_parse != '=')
9671               named_capture:
9672                 {               /* (?<...>) */
9673                     char *name_start;
9674                     SV *svname;
9675                     paren= '>';
9676             case '\'':          /* (?'...') */
9677                     name_start= RExC_parse;
9678                     svname = reg_scan_name(pRExC_state,
9679                         SIZE_ONLY    /* reverse test from the others */
9680                         ? REG_RSN_RETURN_NAME
9681                         : REG_RSN_RETURN_NULL);
9682                     if (RExC_parse == name_start || *RExC_parse != paren)
9683                         vFAIL2("Sequence (?%c... not terminated",
9684                             paren=='>' ? '<' : paren);
9685                     if (SIZE_ONLY) {
9686                         HE *he_str;
9687                         SV *sv_dat = NULL;
9688                         if (!svname) /* shouldn't happen */
9689                             Perl_croak(aTHX_
9690                                 "panic: reg_scan_name returned NULL");
9691                         if (!RExC_paren_names) {
9692                             RExC_paren_names= newHV();
9693                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9694 #ifdef DEBUGGING
9695                             RExC_paren_name_list= newAV();
9696                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9697 #endif
9698                         }
9699                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9700                         if ( he_str )
9701                             sv_dat = HeVAL(he_str);
9702                         if ( ! sv_dat ) {
9703                             /* croak baby croak */
9704                             Perl_croak(aTHX_
9705                                 "panic: paren_name hash element allocation failed");
9706                         } else if ( SvPOK(sv_dat) ) {
9707                             /* (?|...) can mean we have dupes so scan to check
9708                                its already been stored. Maybe a flag indicating
9709                                we are inside such a construct would be useful,
9710                                but the arrays are likely to be quite small, so
9711                                for now we punt -- dmq */
9712                             IV count = SvIV(sv_dat);
9713                             I32 *pv = (I32*)SvPVX(sv_dat);
9714                             IV i;
9715                             for ( i = 0 ; i < count ; i++ ) {
9716                                 if ( pv[i] == RExC_npar ) {
9717                                     count = 0;
9718                                     break;
9719                                 }
9720                             }
9721                             if ( count ) {
9722                                 pv = (I32*)SvGROW(sv_dat,
9723                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9724                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9725                                 pv[count] = RExC_npar;
9726                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9727                             }
9728                         } else {
9729                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9730                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9731                                                                 sizeof(I32));
9732                             SvIOK_on(sv_dat);
9733                             SvIV_set(sv_dat, 1);
9734                         }
9735 #ifdef DEBUGGING
9736                         /* Yes this does cause a memory leak in debugging Perls
9737                          * */
9738                         if (!av_store(RExC_paren_name_list,
9739                                       RExC_npar, SvREFCNT_inc(svname)))
9740                             SvREFCNT_dec_NN(svname);
9741 #endif
9742
9743                         /*sv_dump(sv_dat);*/
9744                     }
9745                     nextchar(pRExC_state);
9746                     paren = 1;
9747                     goto capturing_parens;
9748                 }
9749                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9750                 RExC_in_lookbehind++;
9751                 RExC_parse++;
9752                 /* FALLTHROUGH */
9753             case '=':           /* (?=...) */
9754                 RExC_seen_zerolen++;
9755                 break;
9756             case '!':           /* (?!...) */
9757                 RExC_seen_zerolen++;
9758                 if (*RExC_parse == ')') {
9759                     ret=reg_node(pRExC_state, OPFAIL);
9760                     nextchar(pRExC_state);
9761                     return ret;
9762                 }
9763                 break;
9764             case '|':           /* (?|...) */
9765                 /* branch reset, behave like a (?:...) except that
9766                    buffers in alternations share the same numbers */
9767                 paren = ':';
9768                 after_freeze = freeze_paren = RExC_npar;
9769                 break;
9770             case ':':           /* (?:...) */
9771             case '>':           /* (?>...) */
9772                 break;
9773             case '$':           /* (?$...) */
9774             case '@':           /* (?@...) */
9775                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9776                 break;
9777             case '0' :           /* (?0) */
9778             case 'R' :           /* (?R) */
9779                 if (*RExC_parse != ')')
9780                     FAIL("Sequence (?R) not terminated");
9781                 ret = reg_node(pRExC_state, GOSTART);
9782                     RExC_seen |= REG_GOSTART_SEEN;
9783                 *flagp |= POSTPONED;
9784                 nextchar(pRExC_state);
9785                 return ret;
9786                 /*notreached*/
9787             /* named and numeric backreferences */
9788             case '&':            /* (?&NAME) */
9789                 parse_start = RExC_parse - 1;
9790               named_recursion:
9791                 {
9792                     SV *sv_dat = reg_scan_name(pRExC_state,
9793                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9794                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9795                 }
9796                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9797                     vFAIL("Sequence (?&... not terminated");
9798                 goto gen_recurse_regop;
9799                 assert(0); /* NOT REACHED */
9800             case '+':
9801                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9802                     RExC_parse++;
9803                     vFAIL("Illegal pattern");
9804                 }
9805                 goto parse_recursion;
9806                 /* NOT REACHED*/
9807             case '-': /* (?-1) */
9808                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9809                     RExC_parse--; /* rewind to let it be handled later */
9810                     goto parse_flags;
9811                 }
9812                 /* FALLTHROUGH */
9813             case '1': case '2': case '3': case '4': /* (?1) */
9814             case '5': case '6': case '7': case '8': case '9':
9815                 RExC_parse--;
9816               parse_recursion:
9817                 num = atoi(RExC_parse);
9818                 parse_start = RExC_parse - 1; /* MJD */
9819                 if (*RExC_parse == '-')
9820                     RExC_parse++;
9821                 while (isDIGIT(*RExC_parse))
9822                         RExC_parse++;
9823                 if (*RExC_parse!=')')
9824                     vFAIL("Expecting close bracket");
9825
9826               gen_recurse_regop:
9827                 if ( paren == '-' ) {
9828                     /*
9829                     Diagram of capture buffer numbering.
9830                     Top line is the normal capture buffer numbers
9831                     Bottom line is the negative indexing as from
9832                     the X (the (?-2))
9833
9834                     +   1 2    3 4 5 X          6 7
9835                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9836                     -   5 4    3 2 1 X          x x
9837
9838                     */
9839                     num = RExC_npar + num;
9840                     if (num < 1)  {
9841                         RExC_parse++;
9842                         vFAIL("Reference to nonexistent group");
9843                     }
9844                 } else if ( paren == '+' ) {
9845                     num = RExC_npar + num - 1;
9846                 }
9847
9848                 ret = reganode(pRExC_state, GOSUB, num);
9849                 if (!SIZE_ONLY) {
9850                     if (num > (I32)RExC_rx->nparens) {
9851                         RExC_parse++;
9852                         vFAIL("Reference to nonexistent group");
9853                     }
9854                     ARG2L_SET( ret, RExC_recurse_count++);
9855                     RExC_emit++;
9856                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9857                         "Recurse #%"UVuf" to %"IVdf"\n",
9858                               (UV)ARG(ret), (IV)ARG2L(ret)));
9859                 } else {
9860                     RExC_size++;
9861                 }
9862                     RExC_seen |= REG_RECURSE_SEEN;
9863                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9864                 Set_Node_Offset(ret, parse_start); /* MJD */
9865
9866                 *flagp |= POSTPONED;
9867                 nextchar(pRExC_state);
9868                 return ret;
9869
9870             assert(0); /* NOT REACHED */
9871
9872             case '?':           /* (??...) */
9873                 is_logical = 1;
9874                 if (*RExC_parse != '{') {
9875                     RExC_parse++;
9876                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9877                     vFAIL2utf8f(
9878                         "Sequence (%"UTF8f"...) not recognized",
9879                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9880                     /*NOTREACHED*/
9881                 }
9882                 *flagp |= POSTPONED;
9883                 paren = *RExC_parse++;
9884                 /* FALLTHROUGH */
9885             case '{':           /* (?{...}) */
9886             {
9887                 U32 n = 0;
9888                 struct reg_code_block *cb;
9889
9890                 RExC_seen_zerolen++;
9891
9892                 if (   !pRExC_state->num_code_blocks
9893                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9894                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9895                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9896                             - RExC_start)
9897                 ) {
9898                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9899                         FAIL("panic: Sequence (?{...}): no code block found\n");
9900                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9901                 }
9902                 /* this is a pre-compiled code block (?{...}) */
9903                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9904                 RExC_parse = RExC_start + cb->end;
9905                 if (!SIZE_ONLY) {
9906                     OP *o = cb->block;
9907                     if (cb->src_regex) {
9908                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9909                         RExC_rxi->data->data[n] =
9910                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9911                         RExC_rxi->data->data[n+1] = (void*)o;
9912                     }
9913                     else {
9914                         n = add_data(pRExC_state,
9915                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9916                         RExC_rxi->data->data[n] = (void*)o;
9917                     }
9918                 }
9919                 pRExC_state->code_index++;
9920                 nextchar(pRExC_state);
9921
9922                 if (is_logical) {
9923                     regnode *eval;
9924                     ret = reg_node(pRExC_state, LOGICAL);
9925                     eval = reganode(pRExC_state, EVAL, n);
9926                     if (!SIZE_ONLY) {
9927                         ret->flags = 2;
9928                         /* for later propagation into (??{}) return value */
9929                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9930                     }
9931                     REGTAIL(pRExC_state, ret, eval);
9932                     /* deal with the length of this later - MJD */
9933                     return ret;
9934                 }
9935                 ret = reganode(pRExC_state, EVAL, n);
9936                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9937                 Set_Node_Offset(ret, parse_start);
9938                 return ret;
9939             }
9940             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9941             {
9942                 int is_define= 0;
9943                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9944                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9945                         || RExC_parse[1] == '<'
9946                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9947                         I32 flag;
9948                         regnode *tail;
9949
9950                         ret = reg_node(pRExC_state, LOGICAL);
9951                         if (!SIZE_ONLY)
9952                             ret->flags = 1;
9953
9954                         tail = reg(pRExC_state, 1, &flag, depth+1);
9955                         if (flag & RESTART_UTF8) {
9956                             *flagp = RESTART_UTF8;
9957                             return NULL;
9958                         }
9959                         REGTAIL(pRExC_state, ret, tail);
9960                         goto insert_if;
9961                     }
9962                 }
9963                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9964                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9965                 {
9966                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9967                     char *name_start= RExC_parse++;
9968                     U32 num = 0;
9969                     SV *sv_dat=reg_scan_name(pRExC_state,
9970                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9971                     if (RExC_parse == name_start || *RExC_parse != ch)
9972                         vFAIL2("Sequence (?(%c... not terminated",
9973                             (ch == '>' ? '<' : ch));
9974                     RExC_parse++;
9975                     if (!SIZE_ONLY) {
9976                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9977                         RExC_rxi->data->data[num]=(void*)sv_dat;
9978                         SvREFCNT_inc_simple_void(sv_dat);
9979                     }
9980                     ret = reganode(pRExC_state,NGROUPP,num);
9981                     goto insert_if_check_paren;
9982                 }
9983                 else if (RExC_parse[0] == 'D' &&
9984                          RExC_parse[1] == 'E' &&
9985                          RExC_parse[2] == 'F' &&
9986                          RExC_parse[3] == 'I' &&
9987                          RExC_parse[4] == 'N' &&
9988                          RExC_parse[5] == 'E')
9989                 {
9990                     ret = reganode(pRExC_state,DEFINEP,0);
9991                     RExC_parse +=6 ;
9992                     is_define = 1;
9993                     goto insert_if_check_paren;
9994                 }
9995                 else if (RExC_parse[0] == 'R') {
9996                     RExC_parse++;
9997                     parno = 0;
9998                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9999                         parno = atoi(RExC_parse++);
10000                         while (isDIGIT(*RExC_parse))
10001                             RExC_parse++;
10002                     } else if (RExC_parse[0] == '&') {
10003                         SV *sv_dat;
10004                         RExC_parse++;
10005                         sv_dat = reg_scan_name(pRExC_state,
10006                             SIZE_ONLY
10007                             ? REG_RSN_RETURN_NULL
10008                             : REG_RSN_RETURN_DATA);
10009                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10010                     }
10011                     ret = reganode(pRExC_state,INSUBP,parno);
10012                     goto insert_if_check_paren;
10013                 }
10014                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10015                     /* (?(1)...) */
10016                     char c;
10017                     char *tmp;
10018                     parno = atoi(RExC_parse++);
10019
10020                     while (isDIGIT(*RExC_parse))
10021                         RExC_parse++;
10022                     ret = reganode(pRExC_state, GROUPP, parno);
10023
10024                  insert_if_check_paren:
10025                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10026                         /* nextchar also skips comments, so undo its work
10027                          * and skip over the next character.
10028                          */
10029                         RExC_parse = tmp;
10030                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10031                         vFAIL("Switch condition not recognized");
10032                     }
10033                   insert_if:
10034                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10035                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10036                     if (br == NULL) {
10037                         if (flags & RESTART_UTF8) {
10038                             *flagp = RESTART_UTF8;
10039                             return NULL;
10040                         }
10041                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10042                               (UV) flags);
10043                     } else
10044                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10045                                                           LONGJMP, 0));
10046                     c = *nextchar(pRExC_state);
10047                     if (flags&HASWIDTH)
10048                         *flagp |= HASWIDTH;
10049                     if (c == '|') {
10050                         if (is_define)
10051                             vFAIL("(?(DEFINE)....) does not allow branches");
10052
10053                         /* Fake one for optimizer.  */
10054                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10055
10056                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10057                             if (flags & RESTART_UTF8) {
10058                                 *flagp = RESTART_UTF8;
10059                                 return NULL;
10060                             }
10061                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10062                                   (UV) flags);
10063                         }
10064                         REGTAIL(pRExC_state, ret, lastbr);
10065                         if (flags&HASWIDTH)
10066                             *flagp |= HASWIDTH;
10067                         c = *nextchar(pRExC_state);
10068                     }
10069                     else
10070                         lastbr = NULL;
10071                     if (c != ')')
10072                         vFAIL("Switch (?(condition)... contains too many branches");
10073                     ender = reg_node(pRExC_state, TAIL);
10074                     REGTAIL(pRExC_state, br, ender);
10075                     if (lastbr) {
10076                         REGTAIL(pRExC_state, lastbr, ender);
10077                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10078                     }
10079                     else
10080                         REGTAIL(pRExC_state, ret, ender);
10081                     RExC_size++; /* XXX WHY do we need this?!!
10082                                     For large programs it seems to be required
10083                                     but I can't figure out why. -- dmq*/
10084                     return ret;
10085                 }
10086                 else {
10087                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10088                     vFAIL("Unknown switch condition (?(...))");
10089                 }
10090             }
10091             case '[':           /* (?[ ... ]) */
10092                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10093                                          oregcomp_parse);
10094             case 0:
10095                 RExC_parse--; /* for vFAIL to print correctly */
10096                 vFAIL("Sequence (? incomplete");
10097                 break;
10098             default: /* e.g., (?i) */
10099                 --RExC_parse;
10100               parse_flags:
10101                 parse_lparen_question_flags(pRExC_state);
10102                 if (UCHARAT(RExC_parse) != ':') {
10103                     nextchar(pRExC_state);
10104                     *flagp = TRYAGAIN;
10105                     return NULL;
10106                 }
10107                 paren = ':';
10108                 nextchar(pRExC_state);
10109                 ret = NULL;
10110                 goto parse_rest;
10111             } /* end switch */
10112         }
10113         else {                  /* (...) */
10114           capturing_parens:
10115             parno = RExC_npar;
10116             RExC_npar++;
10117
10118             ret = reganode(pRExC_state, OPEN, parno);
10119             if (!SIZE_ONLY ){
10120                 if (!RExC_nestroot)
10121                     RExC_nestroot = parno;
10122                 if (RExC_seen & REG_RECURSE_SEEN
10123                     && !RExC_open_parens[parno-1])
10124                 {
10125                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10126                         "Setting open paren #%"IVdf" to %d\n",
10127                         (IV)parno, REG_NODE_NUM(ret)));
10128                     RExC_open_parens[parno-1]= ret;
10129                 }
10130             }
10131             Set_Node_Length(ret, 1); /* MJD */
10132             Set_Node_Offset(ret, RExC_parse); /* MJD */
10133             is_open = 1;
10134         }
10135     }
10136     else                        /* ! paren */
10137         ret = NULL;
10138
10139    parse_rest:
10140     /* Pick up the branches, linking them together. */
10141     parse_start = RExC_parse;   /* MJD */
10142     br = regbranch(pRExC_state, &flags, 1,depth+1);
10143
10144     /*     branch_len = (paren != 0); */
10145
10146     if (br == NULL) {
10147         if (flags & RESTART_UTF8) {
10148             *flagp = RESTART_UTF8;
10149             return NULL;
10150         }
10151         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10152     }
10153     if (*RExC_parse == '|') {
10154         if (!SIZE_ONLY && RExC_extralen) {
10155             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10156         }
10157         else {                  /* MJD */
10158             reginsert(pRExC_state, BRANCH, br, depth+1);
10159             Set_Node_Length(br, paren != 0);
10160             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10161         }
10162         have_branch = 1;
10163         if (SIZE_ONLY)
10164             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10165     }
10166     else if (paren == ':') {
10167         *flagp |= flags&SIMPLE;
10168     }
10169     if (is_open) {                              /* Starts with OPEN. */
10170         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10171     }
10172     else if (paren != '?')              /* Not Conditional */
10173         ret = br;
10174     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10175     lastbr = br;
10176     while (*RExC_parse == '|') {
10177         if (!SIZE_ONLY && RExC_extralen) {
10178             ender = reganode(pRExC_state, LONGJMP,0);
10179
10180             /* Append to the previous. */
10181             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10182         }
10183         if (SIZE_ONLY)
10184             RExC_extralen += 2;         /* Account for LONGJMP. */
10185         nextchar(pRExC_state);
10186         if (freeze_paren) {
10187             if (RExC_npar > after_freeze)
10188                 after_freeze = RExC_npar;
10189             RExC_npar = freeze_paren;
10190         }
10191         br = regbranch(pRExC_state, &flags, 0, depth+1);
10192
10193         if (br == NULL) {
10194             if (flags & RESTART_UTF8) {
10195                 *flagp = RESTART_UTF8;
10196                 return NULL;
10197             }
10198             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10199         }
10200         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10201         lastbr = br;
10202         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10203     }
10204
10205     if (have_branch || paren != ':') {
10206         /* Make a closing node, and hook it on the end. */
10207         switch (paren) {
10208         case ':':
10209             ender = reg_node(pRExC_state, TAIL);
10210             break;
10211         case 1: case 2:
10212             ender = reganode(pRExC_state, CLOSE, parno);
10213             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10214                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10215                         "Setting close paren #%"IVdf" to %d\n",
10216                         (IV)parno, REG_NODE_NUM(ender)));
10217                 RExC_close_parens[parno-1]= ender;
10218                 if (RExC_nestroot == parno)
10219                     RExC_nestroot = 0;
10220             }
10221             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10222             Set_Node_Length(ender,1); /* MJD */
10223             break;
10224         case '<':
10225         case ',':
10226         case '=':
10227         case '!':
10228             *flagp &= ~HASWIDTH;
10229             /* FALLTHROUGH */
10230         case '>':
10231             ender = reg_node(pRExC_state, SUCCEED);
10232             break;
10233         case 0:
10234             ender = reg_node(pRExC_state, END);
10235             if (!SIZE_ONLY) {
10236                 assert(!RExC_opend); /* there can only be one! */
10237                 RExC_opend = ender;
10238             }
10239             break;
10240         }
10241         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10242             SV * const mysv_val1=sv_newmortal();
10243             SV * const mysv_val2=sv_newmortal();
10244             DEBUG_PARSE_MSG("lsbr");
10245             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10246             regprop(RExC_rx, mysv_val2, ender, NULL);
10247             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10248                           SvPV_nolen_const(mysv_val1),
10249                           (IV)REG_NODE_NUM(lastbr),
10250                           SvPV_nolen_const(mysv_val2),
10251                           (IV)REG_NODE_NUM(ender),
10252                           (IV)(ender - lastbr)
10253             );
10254         });
10255         REGTAIL(pRExC_state, lastbr, ender);
10256
10257         if (have_branch && !SIZE_ONLY) {
10258             char is_nothing= 1;
10259             if (depth==1)
10260                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10261
10262             /* Hook the tails of the branches to the closing node. */
10263             for (br = ret; br; br = regnext(br)) {
10264                 const U8 op = PL_regkind[OP(br)];
10265                 if (op == BRANCH) {
10266                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10267                     if ( OP(NEXTOPER(br)) != NOTHING
10268                          || regnext(NEXTOPER(br)) != ender)
10269                         is_nothing= 0;
10270                 }
10271                 else if (op == BRANCHJ) {
10272                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10273                     /* for now we always disable this optimisation * /
10274                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10275                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10276                     */
10277                         is_nothing= 0;
10278                 }
10279             }
10280             if (is_nothing) {
10281                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10282                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10283                     SV * const mysv_val1=sv_newmortal();
10284                     SV * const mysv_val2=sv_newmortal();
10285                     DEBUG_PARSE_MSG("NADA");
10286                     regprop(RExC_rx, mysv_val1, ret, NULL);
10287                     regprop(RExC_rx, mysv_val2, ender, NULL);
10288                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10289                                   SvPV_nolen_const(mysv_val1),
10290                                   (IV)REG_NODE_NUM(ret),
10291                                   SvPV_nolen_const(mysv_val2),
10292                                   (IV)REG_NODE_NUM(ender),
10293                                   (IV)(ender - ret)
10294                     );
10295                 });
10296                 OP(br)= NOTHING;
10297                 if (OP(ender) == TAIL) {
10298                     NEXT_OFF(br)= 0;
10299                     RExC_emit= br + 1;
10300                 } else {
10301                     regnode *opt;
10302                     for ( opt= br + 1; opt < ender ; opt++ )
10303                         OP(opt)= OPTIMIZED;
10304                     NEXT_OFF(br)= ender - br;
10305                 }
10306             }
10307         }
10308     }
10309
10310     {
10311         const char *p;
10312         static const char parens[] = "=!<,>";
10313
10314         if (paren && (p = strchr(parens, paren))) {
10315             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10316             int flag = (p - parens) > 1;
10317
10318             if (paren == '>')
10319                 node = SUSPEND, flag = 0;
10320             reginsert(pRExC_state, node,ret, depth+1);
10321             Set_Node_Cur_Length(ret, parse_start);
10322             Set_Node_Offset(ret, parse_start + 1);
10323             ret->flags = flag;
10324             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10325         }
10326     }
10327
10328     /* Check for proper termination. */
10329     if (paren) {
10330         /* restore original flags, but keep (?p) */
10331         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10332         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10333             RExC_parse = oregcomp_parse;
10334             vFAIL("Unmatched (");
10335         }
10336     }
10337     else if (!paren && RExC_parse < RExC_end) {
10338         if (*RExC_parse == ')') {
10339             RExC_parse++;
10340             vFAIL("Unmatched )");
10341         }
10342         else
10343             FAIL("Junk on end of regexp");      /* "Can't happen". */
10344         assert(0); /* NOTREACHED */
10345     }
10346
10347     if (RExC_in_lookbehind) {
10348         RExC_in_lookbehind--;
10349     }
10350     if (after_freeze > RExC_npar)
10351         RExC_npar = after_freeze;
10352     return(ret);
10353 }
10354
10355 /*
10356  - regbranch - one alternative of an | operator
10357  *
10358  * Implements the concatenation operator.
10359  *
10360  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10361  * restarted.
10362  */
10363 STATIC regnode *
10364 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10365 {
10366     regnode *ret;
10367     regnode *chain = NULL;
10368     regnode *latest;
10369     I32 flags = 0, c = 0;
10370     GET_RE_DEBUG_FLAGS_DECL;
10371
10372     PERL_ARGS_ASSERT_REGBRANCH;
10373
10374     DEBUG_PARSE("brnc");
10375
10376     if (first)
10377         ret = NULL;
10378     else {
10379         if (!SIZE_ONLY && RExC_extralen)
10380             ret = reganode(pRExC_state, BRANCHJ,0);
10381         else {
10382             ret = reg_node(pRExC_state, BRANCH);
10383             Set_Node_Length(ret, 1);
10384         }
10385     }
10386
10387     if (!first && SIZE_ONLY)
10388         RExC_extralen += 1;                     /* BRANCHJ */
10389
10390     *flagp = WORST;                     /* Tentatively. */
10391
10392     RExC_parse--;
10393     nextchar(pRExC_state);
10394     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10395         flags &= ~TRYAGAIN;
10396         latest = regpiece(pRExC_state, &flags,depth+1);
10397         if (latest == NULL) {
10398             if (flags & TRYAGAIN)
10399                 continue;
10400             if (flags & RESTART_UTF8) {
10401                 *flagp = RESTART_UTF8;
10402                 return NULL;
10403             }
10404             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10405         }
10406         else if (ret == NULL)
10407             ret = latest;
10408         *flagp |= flags&(HASWIDTH|POSTPONED);
10409         if (chain == NULL)      /* First piece. */
10410             *flagp |= flags&SPSTART;
10411         else {
10412             RExC_naughty++;
10413             REGTAIL(pRExC_state, chain, latest);
10414         }
10415         chain = latest;
10416         c++;
10417     }
10418     if (chain == NULL) {        /* Loop ran zero times. */
10419         chain = reg_node(pRExC_state, NOTHING);
10420         if (ret == NULL)
10421             ret = chain;
10422     }
10423     if (c == 1) {
10424         *flagp |= flags&SIMPLE;
10425     }
10426
10427     return ret;
10428 }
10429
10430 /*
10431  - regpiece - something followed by possible [*+?]
10432  *
10433  * Note that the branching code sequences used for ? and the general cases
10434  * of * and + are somewhat optimized:  they use the same NOTHING node as
10435  * both the endmarker for their branch list and the body of the last branch.
10436  * It might seem that this node could be dispensed with entirely, but the
10437  * endmarker role is not redundant.
10438  *
10439  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10440  * TRYAGAIN.
10441  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10442  * restarted.
10443  */
10444 STATIC regnode *
10445 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10446 {
10447     regnode *ret;
10448     char op;
10449     char *next;
10450     I32 flags;
10451     const char * const origparse = RExC_parse;
10452     I32 min;
10453     I32 max = REG_INFTY;
10454 #ifdef RE_TRACK_PATTERN_OFFSETS
10455     char *parse_start;
10456 #endif
10457     const char *maxpos = NULL;
10458
10459     /* Save the original in case we change the emitted regop to a FAIL. */
10460     regnode * const orig_emit = RExC_emit;
10461
10462     GET_RE_DEBUG_FLAGS_DECL;
10463
10464     PERL_ARGS_ASSERT_REGPIECE;
10465
10466     DEBUG_PARSE("piec");
10467
10468     ret = regatom(pRExC_state, &flags,depth+1);
10469     if (ret == NULL) {
10470         if (flags & (TRYAGAIN|RESTART_UTF8))
10471             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10472         else
10473             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10474         return(NULL);
10475     }
10476
10477     op = *RExC_parse;
10478
10479     if (op == '{' && regcurly(RExC_parse)) {
10480         maxpos = NULL;
10481 #ifdef RE_TRACK_PATTERN_OFFSETS
10482         parse_start = RExC_parse; /* MJD */
10483 #endif
10484         next = RExC_parse + 1;
10485         while (isDIGIT(*next) || *next == ',') {
10486             if (*next == ',') {
10487                 if (maxpos)
10488                     break;
10489                 else
10490                     maxpos = next;
10491             }
10492             next++;
10493         }
10494         if (*next == '}') {             /* got one */
10495             if (!maxpos)
10496                 maxpos = next;
10497             RExC_parse++;
10498             min = atoi(RExC_parse);
10499             if (*maxpos == ',')
10500                 maxpos++;
10501             else
10502                 maxpos = RExC_parse;
10503             max = atoi(maxpos);
10504             if (!max && *maxpos != '0')
10505                 max = REG_INFTY;                /* meaning "infinity" */
10506             else if (max >= REG_INFTY)
10507                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10508             RExC_parse = next;
10509             nextchar(pRExC_state);
10510             if (max < min) {    /* If can't match, warn and optimize to fail
10511                                    unconditionally */
10512                 if (SIZE_ONLY) {
10513                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10514
10515                     /* We can't back off the size because we have to reserve
10516                      * enough space for all the things we are about to throw
10517                      * away, but we can shrink it by the amount we are about
10518                      * to re-use here */
10519                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10520                 }
10521                 else {
10522                     RExC_emit = orig_emit;
10523                 }
10524                 ret = reg_node(pRExC_state, OPFAIL);
10525                 return ret;
10526             }
10527             else if (min == max
10528                      && RExC_parse < RExC_end
10529                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10530             {
10531                 if (SIZE_ONLY) {
10532                     ckWARN2reg(RExC_parse + 1,
10533                                "Useless use of greediness modifier '%c'",
10534                                *RExC_parse);
10535                 }
10536                 /* Absorb the modifier, so later code doesn't see nor use
10537                     * it */
10538                 nextchar(pRExC_state);
10539             }
10540
10541         do_curly:
10542             if ((flags&SIMPLE)) {
10543                 RExC_naughty += 2 + RExC_naughty / 2;
10544                 reginsert(pRExC_state, CURLY, ret, depth+1);
10545                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10546                 Set_Node_Cur_Length(ret, parse_start);
10547             }
10548             else {
10549                 regnode * const w = reg_node(pRExC_state, WHILEM);
10550
10551                 w->flags = 0;
10552                 REGTAIL(pRExC_state, ret, w);
10553                 if (!SIZE_ONLY && RExC_extralen) {
10554                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10555                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10556                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10557                 }
10558                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10559                                 /* MJD hk */
10560                 Set_Node_Offset(ret, parse_start+1);
10561                 Set_Node_Length(ret,
10562                                 op == '{' ? (RExC_parse - parse_start) : 1);
10563
10564                 if (!SIZE_ONLY && RExC_extralen)
10565                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10566                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10567                 if (SIZE_ONLY)
10568                     RExC_whilem_seen++, RExC_extralen += 3;
10569                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10570             }
10571             ret->flags = 0;
10572
10573             if (min > 0)
10574                 *flagp = WORST;
10575             if (max > 0)
10576                 *flagp |= HASWIDTH;
10577             if (!SIZE_ONLY) {
10578                 ARG1_SET(ret, (U16)min);
10579                 ARG2_SET(ret, (U16)max);
10580             }
10581             if (max == REG_INFTY)
10582                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10583
10584             goto nest_check;
10585         }
10586     }
10587
10588     if (!ISMULT1(op)) {
10589         *flagp = flags;
10590         return(ret);
10591     }
10592
10593 #if 0                           /* Now runtime fix should be reliable. */
10594
10595     /* if this is reinstated, don't forget to put this back into perldiag:
10596
10597             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10598
10599            (F) The part of the regexp subject to either the * or + quantifier
10600            could match an empty string. The {#} shows in the regular
10601            expression about where the problem was discovered.
10602
10603     */
10604
10605     if (!(flags&HASWIDTH) && op != '?')
10606       vFAIL("Regexp *+ operand could be empty");
10607 #endif
10608
10609 #ifdef RE_TRACK_PATTERN_OFFSETS
10610     parse_start = RExC_parse;
10611 #endif
10612     nextchar(pRExC_state);
10613
10614     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10615
10616     if (op == '*' && (flags&SIMPLE)) {
10617         reginsert(pRExC_state, STAR, ret, depth+1);
10618         ret->flags = 0;
10619         RExC_naughty += 4;
10620         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10621     }
10622     else if (op == '*') {
10623         min = 0;
10624         goto do_curly;
10625     }
10626     else if (op == '+' && (flags&SIMPLE)) {
10627         reginsert(pRExC_state, PLUS, ret, depth+1);
10628         ret->flags = 0;
10629         RExC_naughty += 3;
10630         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10631     }
10632     else if (op == '+') {
10633         min = 1;
10634         goto do_curly;
10635     }
10636     else if (op == '?') {
10637         min = 0; max = 1;
10638         goto do_curly;
10639     }
10640   nest_check:
10641     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10642         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10643         ckWARN2reg(RExC_parse,
10644                    "%"UTF8f" matches null string many times",
10645                    UTF8fARG(UTF, (RExC_parse >= origparse
10646                                  ? RExC_parse - origparse
10647                                  : 0),
10648                    origparse));
10649         (void)ReREFCNT_inc(RExC_rx_sv);
10650     }
10651
10652     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10653         nextchar(pRExC_state);
10654         reginsert(pRExC_state, MINMOD, ret, depth+1);
10655         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10656     }
10657     else
10658     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10659         regnode *ender;
10660         nextchar(pRExC_state);
10661         ender = reg_node(pRExC_state, SUCCEED);
10662         REGTAIL(pRExC_state, ret, ender);
10663         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10664         ret->flags = 0;
10665         ender = reg_node(pRExC_state, TAIL);
10666         REGTAIL(pRExC_state, ret, ender);
10667     }
10668
10669     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10670         RExC_parse++;
10671         vFAIL("Nested quantifiers");
10672     }
10673
10674     return(ret);
10675 }
10676
10677 STATIC bool
10678 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10679                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10680                       const bool strict   /* Apply stricter parsing rules? */
10681     )
10682 {
10683
10684  /* This is expected to be called by a parser routine that has recognized '\N'
10685    and needs to handle the rest. RExC_parse is expected to point at the first
10686    char following the N at the time of the call.  On successful return,
10687    RExC_parse has been updated to point to just after the sequence identified
10688    by this routine, and <*flagp> has been updated.
10689
10690    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10691    character class.
10692
10693    \N may begin either a named sequence, or if outside a character class, mean
10694    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10695    attempted to decide which, and in the case of a named sequence, converted it
10696    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10697    where c1... are the characters in the sequence.  For single-quoted regexes,
10698    the tokenizer passes the \N sequence through unchanged; this code will not
10699    attempt to determine this nor expand those, instead raising a syntax error.
10700    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10701    or there is no '}', it signals that this \N occurrence means to match a
10702    non-newline.
10703
10704    Only the \N{U+...} form should occur in a character class, for the same
10705    reason that '.' inside a character class means to just match a period: it
10706    just doesn't make sense.
10707
10708    The function raises an error (via vFAIL), and doesn't return for various
10709    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10710    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10711    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10712    only possible if node_p is non-NULL.
10713
10714
10715    If <valuep> is non-null, it means the caller can accept an input sequence
10716    consisting of a just a single code point; <*valuep> is set to that value
10717    if the input is such.
10718
10719    If <node_p> is non-null it signifies that the caller can accept any other
10720    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10721    is set as follows:
10722     1) \N means not-a-NL: points to a newly created REG_ANY node;
10723     2) \N{}:              points to a new NOTHING node;
10724     3) otherwise:         points to a new EXACT node containing the resolved
10725                           string.
10726    Note that FALSE is returned for single code point sequences if <valuep> is
10727    null.
10728  */
10729
10730     char * endbrace;    /* '}' following the name */
10731     char* p;
10732     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10733                            stream */
10734     bool has_multiple_chars; /* true if the input stream contains a sequence of
10735                                 more than one character */
10736
10737     GET_RE_DEBUG_FLAGS_DECL;
10738
10739     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10740
10741     GET_RE_DEBUG_FLAGS;
10742
10743     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10744
10745     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10746      * modifier.  The other meaning does not, so use a temporary until we find
10747      * out which we are being called with */
10748     p = (RExC_flags & RXf_PMf_EXTENDED)
10749         ? regpatws(pRExC_state, RExC_parse,
10750                                 TRUE) /* means recognize comments */
10751         : RExC_parse;
10752
10753     /* Disambiguate between \N meaning a named character versus \N meaning
10754      * [^\n].  The former is assumed when it can't be the latter. */
10755     if (*p != '{' || regcurly(p)) {
10756         RExC_parse = p;
10757         if (! node_p) {
10758             /* no bare \N allowed in a charclass */
10759             if (in_char_class) {
10760                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10761             }
10762             return FALSE;
10763         }
10764         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10765                            current char */
10766         nextchar(pRExC_state);
10767         *node_p = reg_node(pRExC_state, REG_ANY);
10768         *flagp |= HASWIDTH|SIMPLE;
10769         RExC_naughty++;
10770         Set_Node_Length(*node_p, 1); /* MJD */
10771         return TRUE;
10772     }
10773
10774     /* Here, we have decided it should be a named character or sequence */
10775
10776     /* The test above made sure that the next real character is a '{', but
10777      * under the /x modifier, it could be separated by space (or a comment and
10778      * \n) and this is not allowed (for consistency with \x{...} and the
10779      * tokenizer handling of \N{NAME}). */
10780     if (*RExC_parse != '{') {
10781         vFAIL("Missing braces on \\N{}");
10782     }
10783
10784     RExC_parse++;       /* Skip past the '{' */
10785
10786     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10787         || ! (endbrace == RExC_parse            /* nothing between the {} */
10788               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10789                                                  */
10790                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10791                                                      */
10792     {
10793         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10794         vFAIL("\\N{NAME} must be resolved by the lexer");
10795     }
10796
10797     if (endbrace == RExC_parse) {   /* empty: \N{} */
10798         bool ret = TRUE;
10799         if (node_p) {
10800             *node_p = reg_node(pRExC_state,NOTHING);
10801         }
10802         else if (in_char_class) {
10803             if (SIZE_ONLY && in_char_class) {
10804                 if (strict) {
10805                     RExC_parse++;   /* Position after the "}" */
10806                     vFAIL("Zero length \\N{}");
10807                 }
10808                 else {
10809                     ckWARNreg(RExC_parse,
10810                               "Ignoring zero length \\N{} in character class");
10811                 }
10812             }
10813             ret = FALSE;
10814         }
10815         else {
10816             return FALSE;
10817         }
10818         nextchar(pRExC_state);
10819         return ret;
10820     }
10821
10822     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10823     RExC_parse += 2;    /* Skip past the 'U+' */
10824
10825     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10826
10827     /* Code points are separated by dots.  If none, there is only one code
10828      * point, and is terminated by the brace */
10829     has_multiple_chars = (endchar < endbrace);
10830
10831     if (valuep && (! has_multiple_chars || in_char_class)) {
10832         /* We only pay attention to the first char of
10833         multichar strings being returned in char classes. I kinda wonder
10834         if this makes sense as it does change the behaviour
10835         from earlier versions, OTOH that behaviour was broken
10836         as well. XXX Solution is to recharacterize as
10837         [rest-of-class]|multi1|multi2... */
10838
10839         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10840         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10841             | PERL_SCAN_DISALLOW_PREFIX
10842             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10843
10844         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10845
10846         /* The tokenizer should have guaranteed validity, but it's possible to
10847          * bypass it by using single quoting, so check */
10848         if (length_of_hex == 0
10849             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10850         {
10851             RExC_parse += length_of_hex;        /* Includes all the valid */
10852             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10853                             ? UTF8SKIP(RExC_parse)
10854                             : 1;
10855             /* Guard against malformed utf8 */
10856             if (RExC_parse >= endchar) {
10857                 RExC_parse = endchar;
10858             }
10859             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10860         }
10861
10862         if (in_char_class && has_multiple_chars) {
10863             if (strict) {
10864                 RExC_parse = endbrace;
10865                 vFAIL("\\N{} in character class restricted to one character");
10866             }
10867             else {
10868                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10869             }
10870         }
10871
10872         RExC_parse = endbrace + 1;
10873     }
10874     else if (! node_p || ! has_multiple_chars) {
10875
10876         /* Here, the input is legal, but not according to the caller's
10877          * options.  We fail without advancing the parse, so that the
10878          * caller can try again */
10879         RExC_parse = p;
10880         return FALSE;
10881     }
10882     else {
10883
10884         /* What is done here is to convert this to a sub-pattern of the form
10885          * (?:\x{char1}\x{char2}...)
10886          * and then call reg recursively.  That way, it retains its atomicness,
10887          * while not having to worry about special handling that some code
10888          * points may have.  toke.c has converted the original Unicode values
10889          * to native, so that we can just pass on the hex values unchanged.  We
10890          * do have to set a flag to keep recoding from happening in the
10891          * recursion */
10892
10893         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10894         STRLEN len;
10895         char *orig_end = RExC_end;
10896         I32 flags;
10897
10898         while (RExC_parse < endbrace) {
10899
10900             /* Convert to notation the rest of the code understands */
10901             sv_catpv(substitute_parse, "\\x{");
10902             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10903             sv_catpv(substitute_parse, "}");
10904
10905             /* Point to the beginning of the next character in the sequence. */
10906             RExC_parse = endchar + 1;
10907             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10908         }
10909         sv_catpv(substitute_parse, ")");
10910
10911         RExC_parse = SvPV(substitute_parse, len);
10912
10913         /* Don't allow empty number */
10914         if (len < 8) {
10915             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10916         }
10917         RExC_end = RExC_parse + len;
10918
10919         /* The values are Unicode, and therefore not subject to recoding */
10920         RExC_override_recoding = 1;
10921
10922         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10923             if (flags & RESTART_UTF8) {
10924                 *flagp = RESTART_UTF8;
10925                 return FALSE;
10926             }
10927             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10928                   (UV) flags);
10929         }
10930         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10931
10932         RExC_parse = endbrace;
10933         RExC_end = orig_end;
10934         RExC_override_recoding = 0;
10935
10936         nextchar(pRExC_state);
10937     }
10938
10939     return TRUE;
10940 }
10941
10942
10943 /*
10944  * reg_recode
10945  *
10946  * It returns the code point in utf8 for the value in *encp.
10947  *    value: a code value in the source encoding
10948  *    encp:  a pointer to an Encode object
10949  *
10950  * If the result from Encode is not a single character,
10951  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10952  */
10953 STATIC UV
10954 S_reg_recode(pTHX_ const char value, SV **encp)
10955 {
10956     STRLEN numlen = 1;
10957     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10958     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10959     const STRLEN newlen = SvCUR(sv);
10960     UV uv = UNICODE_REPLACEMENT;
10961
10962     PERL_ARGS_ASSERT_REG_RECODE;
10963
10964     if (newlen)
10965         uv = SvUTF8(sv)
10966              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10967              : *(U8*)s;
10968
10969     if (!newlen || numlen != newlen) {
10970         uv = UNICODE_REPLACEMENT;
10971         *encp = NULL;
10972     }
10973     return uv;
10974 }
10975
10976 PERL_STATIC_INLINE U8
10977 S_compute_EXACTish(RExC_state_t *pRExC_state)
10978 {
10979     U8 op;
10980
10981     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10982
10983     if (! FOLD) {
10984         return EXACT;
10985     }
10986
10987     op = get_regex_charset(RExC_flags);
10988     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10989         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10990                  been, so there is no hole */
10991     }
10992
10993     return op + EXACTF;
10994 }
10995
10996 PERL_STATIC_INLINE void
10997 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10998                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10999                          bool downgradable)
11000 {
11001     /* This knows the details about sizing an EXACTish node, setting flags for
11002      * it (by setting <*flagp>, and potentially populating it with a single
11003      * character.
11004      *
11005      * If <len> (the length in bytes) is non-zero, this function assumes that
11006      * the node has already been populated, and just does the sizing.  In this
11007      * case <code_point> should be the final code point that has already been
11008      * placed into the node.  This value will be ignored except that under some
11009      * circumstances <*flagp> is set based on it.
11010      *
11011      * If <len> is zero, the function assumes that the node is to contain only
11012      * the single character given by <code_point> and calculates what <len>
11013      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11014      * additionally will populate the node's STRING with <code_point> or its
11015      * fold if folding.
11016      *
11017      * In both cases <*flagp> is appropriately set
11018      *
11019      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11020      * 255, must be folded (the former only when the rules indicate it can
11021      * match 'ss')
11022      *
11023      * When it does the populating, it looks at the flag 'downgradable'.  If
11024      * true with a node that folds, it checks if the single code point
11025      * participates in a fold, and if not downgrades the node to an EXACT.
11026      * This helps the optimizer */
11027
11028     bool len_passed_in = cBOOL(len != 0);
11029     U8 character[UTF8_MAXBYTES_CASE+1];
11030
11031     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11032
11033     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11034      * sizing difference, and is extra work that is thrown away */
11035     if (downgradable && ! PASS2) {
11036         downgradable = FALSE;
11037     }
11038
11039     if (! len_passed_in) {
11040         if (UTF) {
11041             if (UNI_IS_INVARIANT(code_point)) {
11042                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11043                     *character = (U8) code_point;
11044                 }
11045                 else { /* Here is /i and not /l (toFOLD() is defined on just
11046                           ASCII, which isn't the same thing as INVARIANT on
11047                           EBCDIC, but it works there, as the extra invariants
11048                           fold to themselves) */
11049                     *character = toFOLD((U8) code_point);
11050                     if (downgradable
11051                         && *character == code_point
11052                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11053                     {
11054                         OP(node) = EXACT;
11055                     }
11056                 }
11057                 len = 1;
11058             }
11059             else if (FOLD && (! LOC
11060                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11061             {   /* Folding, and ok to do so now */
11062                 UV folded = _to_uni_fold_flags(
11063                                    code_point,
11064                                    character,
11065                                    &len,
11066                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11067                                                       ? FOLD_FLAGS_NOMIX_ASCII
11068                                                       : 0));
11069                 if (downgradable
11070                     && folded == code_point
11071                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11072                 {
11073                     OP(node) = EXACT;
11074                 }
11075             }
11076             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11077
11078                 /* Not folding this cp, and can output it directly */
11079                 *character = UTF8_TWO_BYTE_HI(code_point);
11080                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11081                 len = 2;
11082             }
11083             else {
11084                 uvchr_to_utf8( character, code_point);
11085                 len = UTF8SKIP(character);
11086             }
11087         } /* Else pattern isn't UTF8.  */
11088         else if (! FOLD) {
11089             *character = (U8) code_point;
11090             len = 1;
11091         } /* Else is folded non-UTF8 */
11092         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11093
11094             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11095              * comments at join_exact()); */
11096             *character = (U8) code_point;
11097             len = 1;
11098
11099             /* Can turn into an EXACT node if we know the fold at compile time,
11100              * and it folds to itself and doesn't particpate in other folds */
11101             if (downgradable
11102                 && ! LOC
11103                 && PL_fold_latin1[code_point] == code_point
11104                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11105                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11106             {
11107                 OP(node) = EXACT;
11108             }
11109         } /* else is Sharp s.  May need to fold it */
11110         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11111             *character = 's';
11112             *(character + 1) = 's';
11113             len = 2;
11114         }
11115         else {
11116             *character = LATIN_SMALL_LETTER_SHARP_S;
11117             len = 1;
11118         }
11119     }
11120
11121     if (SIZE_ONLY) {
11122         RExC_size += STR_SZ(len);
11123     }
11124     else {
11125         RExC_emit += STR_SZ(len);
11126         STR_LEN(node) = len;
11127         if (! len_passed_in) {
11128             Copy((char *) character, STRING(node), len, char);
11129         }
11130     }
11131
11132     *flagp |= HASWIDTH;
11133
11134     /* A single character node is SIMPLE, except for the special-cased SHARP S
11135      * under /di. */
11136     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11137         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11138             || ! FOLD || ! DEPENDS_SEMANTICS))
11139     {
11140         *flagp |= SIMPLE;
11141     }
11142
11143     /* The OP may not be well defined in PASS1 */
11144     if (PASS2 && OP(node) == EXACTFL) {
11145         RExC_contains_locale = 1;
11146     }
11147 }
11148
11149
11150 /* return atoi(p), unless it's too big to sensibly be a backref,
11151  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11152
11153 static I32
11154 S_backref_value(char *p)
11155 {
11156     char *q = p;
11157
11158     for (;isDIGIT(*q); q++) {} /* calculate length of num */
11159     if (q - p == 0 || q - p > 9)
11160         return I32_MAX;
11161     return atoi(p);
11162 }
11163
11164
11165 /*
11166  - regatom - the lowest level
11167
11168    Try to identify anything special at the start of the pattern. If there
11169    is, then handle it as required. This may involve generating a single regop,
11170    such as for an assertion; or it may involve recursing, such as to
11171    handle a () structure.
11172
11173    If the string doesn't start with something special then we gobble up
11174    as much literal text as we can.
11175
11176    Once we have been able to handle whatever type of thing started the
11177    sequence, we return.
11178
11179    Note: we have to be careful with escapes, as they can be both literal
11180    and special, and in the case of \10 and friends, context determines which.
11181
11182    A summary of the code structure is:
11183
11184    switch (first_byte) {
11185         cases for each special:
11186             handle this special;
11187             break;
11188         case '\\':
11189             switch (2nd byte) {
11190                 cases for each unambiguous special:
11191                     handle this special;
11192                     break;
                cases for each ambiguous special/literal:
11194                     disambiguate;
11195                     if (special)  handle here
11196                     else goto defchar;
11197                 default: // unambiguously literal:
11198                     goto defchar;
11199             }
11200         default:  // is a literal char
11201             // FALL THROUGH
11202         defchar:
11203             create EXACTish node for literal;
11204             while (more input and node isn't full) {
11205                 switch (input_byte) {
11206                    cases for each special;
11207                        make sure parse pointer is set so that the next call to
11208                            regatom will see this special first
11209                        goto loopdone; // EXACTish node terminated by prev. char
11210                    default:
11211                        append char to EXACTISH node;
11212                 }
11213                 get next input byte;
11214             }
11215         loopdone:
11216    }
11217    return the generated node;
11218
11219    Specifically there are two separate switches for handling
11220    escape sequences, with the one for handling literal escapes requiring
11221    a dummy entry for all of the special escapes that are actually handled
11222    by the other.
11223
11224    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11225    TRYAGAIN.
11226    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11227    restarted.
11228    Otherwise does not return NULL.
11229 */
11230
11231 STATIC regnode *
11232 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11233 {
11234     regnode *ret = NULL;
11235     I32 flags = 0;
11236     char *parse_start = RExC_parse;
11237     U8 op;
11238     int invert = 0;
11239     U8 arg;
11240
11241     GET_RE_DEBUG_FLAGS_DECL;
11242
11243     *flagp = WORST;             /* Tentatively. */
11244
11245     DEBUG_PARSE("atom");
11246
11247     PERL_ARGS_ASSERT_REGATOM;
11248
11249 tryagain:
11250     switch ((U8)*RExC_parse) {
11251     case '^':
11252         RExC_seen_zerolen++;
11253         nextchar(pRExC_state);
11254         if (RExC_flags & RXf_PMf_MULTILINE)
11255             ret = reg_node(pRExC_state, MBOL);
11256         else if (RExC_flags & RXf_PMf_SINGLELINE)
11257             ret = reg_node(pRExC_state, SBOL);
11258         else
11259             ret = reg_node(pRExC_state, BOL);
11260         Set_Node_Length(ret, 1); /* MJD */
11261         break;
11262     case '$':
11263         nextchar(pRExC_state);
11264         if (*RExC_parse)
11265             RExC_seen_zerolen++;
11266         if (RExC_flags & RXf_PMf_MULTILINE)
11267             ret = reg_node(pRExC_state, MEOL);
11268         else if (RExC_flags & RXf_PMf_SINGLELINE)
11269             ret = reg_node(pRExC_state, SEOL);
11270         else
11271             ret = reg_node(pRExC_state, EOL);
11272         Set_Node_Length(ret, 1); /* MJD */
11273         break;
11274     case '.':
11275         nextchar(pRExC_state);
11276         if (RExC_flags & RXf_PMf_SINGLELINE)
11277             ret = reg_node(pRExC_state, SANY);
11278         else
11279             ret = reg_node(pRExC_state, REG_ANY);
11280         *flagp |= HASWIDTH|SIMPLE;
11281         RExC_naughty++;
11282         Set_Node_Length(ret, 1); /* MJD */
11283         break;
11284     case '[':
11285     {
11286         char * const oregcomp_parse = ++RExC_parse;
11287         ret = regclass(pRExC_state, flagp,depth+1,
11288                        FALSE, /* means parse the whole char class */
11289                        TRUE, /* allow multi-char folds */
11290                        FALSE, /* don't silence non-portable warnings. */
11291                        NULL);
11292         if (*RExC_parse != ']') {
11293             RExC_parse = oregcomp_parse;
11294             vFAIL("Unmatched [");
11295         }
11296         if (ret == NULL) {
11297             if (*flagp & RESTART_UTF8)
11298                 return NULL;
11299             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11300                   (UV) *flagp);
11301         }
11302         nextchar(pRExC_state);
11303         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11304         break;
11305     }
11306     case '(':
11307         nextchar(pRExC_state);
11308         ret = reg(pRExC_state, 2, &flags,depth+1);
11309         if (ret == NULL) {
11310                 if (flags & TRYAGAIN) {
11311                     if (RExC_parse == RExC_end) {
11312                          /* Make parent create an empty node if needed. */
11313                         *flagp |= TRYAGAIN;
11314                         return(NULL);
11315                     }
11316                     goto tryagain;
11317                 }
11318                 if (flags & RESTART_UTF8) {
11319                     *flagp = RESTART_UTF8;
11320                     return NULL;
11321                 }
11322                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11323                                                                  (UV) flags);
11324         }
11325         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11326         break;
11327     case '|':
11328     case ')':
11329         if (flags & TRYAGAIN) {
11330             *flagp |= TRYAGAIN;
11331             return NULL;
11332         }
11333         vFAIL("Internal urp");
11334                                 /* Supposed to be caught earlier. */
11335         break;
11336     case '?':
11337     case '+':
11338     case '*':
11339         RExC_parse++;
11340         vFAIL("Quantifier follows nothing");
11341         break;
11342     case '\\':
11343         /* Special Escapes
11344
11345            This switch handles escape sequences that resolve to some kind
11346            of special regop and not to literal text. Escape sequnces that
11347            resolve to literal text are handled below in the switch marked
11348            "Literal Escapes".
11349
11350            Every entry in this switch *must* have a corresponding entry
11351            in the literal escape switch. However, the opposite is not
11352            required, as the default for this switch is to jump to the
11353            literal text handling code.
11354         */
11355         switch ((U8)*++RExC_parse) {
11356         /* Special Escapes */
11357         case 'A':
11358             RExC_seen_zerolen++;
11359             ret = reg_node(pRExC_state, SBOL);
11360             *flagp |= SIMPLE;
11361             goto finish_meta_pat;
11362         case 'G':
11363             ret = reg_node(pRExC_state, GPOS);
11364             RExC_seen |= REG_GPOS_SEEN;
11365             *flagp |= SIMPLE;
11366             goto finish_meta_pat;
11367         case 'K':
11368             RExC_seen_zerolen++;
11369             ret = reg_node(pRExC_state, KEEPS);
11370             *flagp |= SIMPLE;
11371             /* XXX:dmq : disabling in-place substitution seems to
11372              * be necessary here to avoid cases of memory corruption, as
11373              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11374              */
11375             RExC_seen |= REG_LOOKBEHIND_SEEN;
11376             goto finish_meta_pat;
11377         case 'Z':
11378             ret = reg_node(pRExC_state, SEOL);
11379             *flagp |= SIMPLE;
11380             RExC_seen_zerolen++;                /* Do not optimize RE away */
11381             goto finish_meta_pat;
11382         case 'z':
11383             ret = reg_node(pRExC_state, EOS);
11384             *flagp |= SIMPLE;
11385             RExC_seen_zerolen++;                /* Do not optimize RE away */
11386             goto finish_meta_pat;
11387         case 'C':
11388             ret = reg_node(pRExC_state, CANY);
11389             RExC_seen |= REG_CANY_SEEN;
11390             *flagp |= HASWIDTH|SIMPLE;
11391             if (SIZE_ONLY) {
11392                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11393             }
11394             goto finish_meta_pat;
11395         case 'X':
11396             ret = reg_node(pRExC_state, CLUMP);
11397             *flagp |= HASWIDTH;
11398             goto finish_meta_pat;
11399
11400         case 'W':
11401             invert = 1;
11402             /* FALLTHROUGH */
11403         case 'w':
11404             arg = ANYOF_WORDCHAR;
11405             goto join_posix;
11406
11407         case 'b':
11408             RExC_seen_zerolen++;
11409             RExC_seen |= REG_LOOKBEHIND_SEEN;
11410             op = BOUND + get_regex_charset(RExC_flags);
11411             if (op > BOUNDA) {  /* /aa is same as /a */
11412                 op = BOUNDA;
11413             }
11414             else if (op == BOUNDL) {
11415                 RExC_contains_locale = 1;
11416             }
11417             ret = reg_node(pRExC_state, op);
11418             FLAGS(ret) = get_regex_charset(RExC_flags);
11419             *flagp |= SIMPLE;
11420             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11421                 /* diag_listed_as: Use "%s" instead of "%s" */
11422                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11423             }
11424             goto finish_meta_pat;
11425         case 'B':
11426             RExC_seen_zerolen++;
11427             RExC_seen |= REG_LOOKBEHIND_SEEN;
11428             op = NBOUND + get_regex_charset(RExC_flags);
11429             if (op > NBOUNDA) { /* /aa is same as /a */
11430                 op = NBOUNDA;
11431             }
11432             else if (op == NBOUNDL) {
11433                 RExC_contains_locale = 1;
11434             }
11435             ret = reg_node(pRExC_state, op);
11436             FLAGS(ret) = get_regex_charset(RExC_flags);
11437             *flagp |= SIMPLE;
11438             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11439                 /* diag_listed_as: Use "%s" instead of "%s" */
11440                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11441             }
11442             goto finish_meta_pat;
11443
11444         case 'D':
11445             invert = 1;
11446             /* FALLTHROUGH */
11447         case 'd':
11448             arg = ANYOF_DIGIT;
11449             goto join_posix;
11450
11451         case 'R':
11452             ret = reg_node(pRExC_state, LNBREAK);
11453             *flagp |= HASWIDTH|SIMPLE;
11454             goto finish_meta_pat;
11455
11456         case 'H':
11457             invert = 1;
11458             /* FALLTHROUGH */
11459         case 'h':
11460             arg = ANYOF_BLANK;
11461             op = POSIXU;
11462             goto join_posix_op_known;
11463
11464         case 'V':
11465             invert = 1;
11466             /* FALLTHROUGH */
11467         case 'v':
11468             arg = ANYOF_VERTWS;
11469             op = POSIXU;
11470             goto join_posix_op_known;
11471
11472         case 'S':
11473             invert = 1;
11474             /* FALLTHROUGH */
11475         case 's':
11476             arg = ANYOF_SPACE;
11477
11478         join_posix:
11479
11480             op = POSIXD + get_regex_charset(RExC_flags);
11481             if (op > POSIXA) {  /* /aa is same as /a */
11482                 op = POSIXA;
11483             }
11484             else if (op == POSIXL) {
11485                 RExC_contains_locale = 1;
11486             }
11487
11488         join_posix_op_known:
11489
11490             if (invert) {
11491                 op += NPOSIXD - POSIXD;
11492             }
11493
11494             ret = reg_node(pRExC_state, op);
11495             if (! SIZE_ONLY) {
11496                 FLAGS(ret) = namedclass_to_classnum(arg);
11497             }
11498
11499             *flagp |= HASWIDTH|SIMPLE;
11500             /* FALLTHROUGH */
11501
11502          finish_meta_pat:
11503             nextchar(pRExC_state);
11504             Set_Node_Length(ret, 2); /* MJD */
11505             break;
11506         case 'p':
11507         case 'P':
11508             {
11509 #ifdef DEBUGGING
11510                 char* parse_start = RExC_parse - 2;
11511 #endif
11512
11513                 RExC_parse--;
11514
11515                 ret = regclass(pRExC_state, flagp,depth+1,
11516                                TRUE, /* means just parse this element */
11517                                FALSE, /* don't allow multi-char folds */
11518                                FALSE, /* don't silence non-portable warnings.
11519                                          It would be a bug if these returned
11520                                          non-portables */
11521                                NULL);
11522                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11523                    are allowed.  */
11524                 if (!ret)
11525                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11526                           (UV) *flagp);
11527
11528                 RExC_parse--;
11529
11530                 Set_Node_Offset(ret, parse_start + 2);
11531                 Set_Node_Cur_Length(ret, parse_start);
11532                 nextchar(pRExC_state);
11533             }
11534             break;
11535         case 'N':
11536             /* Handle \N and \N{NAME} with multiple code points here and not
11537              * below because it can be multicharacter. join_exact() will join
11538              * them up later on.  Also this makes sure that things like
11539              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11540              * The options to the grok function call causes it to fail if the
11541              * sequence is just a single code point.  We then go treat it as
11542              * just another character in the current EXACT node, and hence it
11543              * gets uniform treatment with all the other characters.  The
11544              * special treatment for quantifiers is not needed for such single
11545              * character sequences */
11546             ++RExC_parse;
11547             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11548                                 FALSE /* not strict */ )) {
11549                 if (*flagp & RESTART_UTF8)
11550                     return NULL;
11551                 RExC_parse--;
11552                 goto defchar;
11553             }
11554             break;
11555         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11556         parse_named_seq:
11557         {
11558             char ch= RExC_parse[1];
11559             if (ch != '<' && ch != '\'' && ch != '{') {
11560                 RExC_parse++;
11561                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11562                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11563             } else {
11564                 /* this pretty much dupes the code for (?P=...) in reg(), if
11565                    you change this make sure you change that */
11566                 char* name_start = (RExC_parse += 2);
11567                 U32 num = 0;
11568                 SV *sv_dat = reg_scan_name(pRExC_state,
11569                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11570                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11571                 if (RExC_parse == name_start || *RExC_parse != ch)
11572                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11573                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11574
11575                 if (!SIZE_ONLY) {
11576                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11577                     RExC_rxi->data->data[num]=(void*)sv_dat;
11578                     SvREFCNT_inc_simple_void(sv_dat);
11579                 }
11580
11581                 RExC_sawback = 1;
11582                 ret = reganode(pRExC_state,
11583                                ((! FOLD)
11584                                  ? NREF
11585                                  : (ASCII_FOLD_RESTRICTED)
11586                                    ? NREFFA
11587                                    : (AT_LEAST_UNI_SEMANTICS)
11588                                      ? NREFFU
11589                                      : (LOC)
11590                                        ? NREFFL
11591                                        : NREFF),
11592                                 num);
11593                 *flagp |= HASWIDTH;
11594
11595                 /* override incorrect value set in reganode MJD */
11596                 Set_Node_Offset(ret, parse_start+1);
11597                 Set_Node_Cur_Length(ret, parse_start);
11598                 nextchar(pRExC_state);
11599
11600             }
11601             break;
11602         }
11603         case 'g':
11604         case '1': case '2': case '3': case '4':
11605         case '5': case '6': case '7': case '8': case '9':
11606             {
11607                 I32 num;
11608                 bool hasbrace = 0;
11609
11610                 if (*RExC_parse == 'g') {
11611                     bool isrel = 0;
11612
11613                     RExC_parse++;
11614                     if (*RExC_parse == '{') {
11615                         RExC_parse++;
11616                         hasbrace = 1;
11617                     }
11618                     if (*RExC_parse == '-') {
11619                         RExC_parse++;
11620                         isrel = 1;
11621                     }
11622                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11623                         if (isrel) RExC_parse--;
11624                         RExC_parse -= 2;
11625                         goto parse_named_seq;
11626                     }
11627
11628                     num = S_backref_value(RExC_parse);
11629                     if (num == 0)
11630                         vFAIL("Reference to invalid group 0");
11631                     else if (num == I32_MAX) {
11632                          if (isDIGIT(*RExC_parse))
11633                             vFAIL("Reference to nonexistent group");
11634                         else
11635                             vFAIL("Unterminated \\g... pattern");
11636                     }
11637
11638                     if (isrel) {
11639                         num = RExC_npar - num;
11640                         if (num < 1)
11641                             vFAIL("Reference to nonexistent or unclosed group");
11642                     }
11643                 }
11644                 else {
11645                     num = S_backref_value(RExC_parse);
11646                     /* bare \NNN might be backref or octal - if it is larger than or equal
11647                      * RExC_npar then it is assumed to be an octal escape.
11648                      * Note RExC_npar is +1 from the actual number of parens*/
11649                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11650                             && *RExC_parse != '8' && *RExC_parse != '9'))
11651                     {
11652                         /* Probably a character specified in octal, e.g. \35 */
11653                         goto defchar;
11654                     }
11655                 }
11656
11657                 /* at this point RExC_parse definitely points to a backref
11658                  * number */
11659                 {
11660 #ifdef RE_TRACK_PATTERN_OFFSETS
11661                     char * const parse_start = RExC_parse - 1; /* MJD */
11662 #endif
11663                     while (isDIGIT(*RExC_parse))
11664                         RExC_parse++;
11665                     if (hasbrace) {
11666                         if (*RExC_parse != '}')
11667                             vFAIL("Unterminated \\g{...} pattern");
11668                         RExC_parse++;
11669                     }
11670                     if (!SIZE_ONLY) {
11671                         if (num > (I32)RExC_rx->nparens)
11672                             vFAIL("Reference to nonexistent group");
11673                     }
11674                     RExC_sawback = 1;
11675                     ret = reganode(pRExC_state,
11676                                    ((! FOLD)
11677                                      ? REF
11678                                      : (ASCII_FOLD_RESTRICTED)
11679                                        ? REFFA
11680                                        : (AT_LEAST_UNI_SEMANTICS)
11681                                          ? REFFU
11682                                          : (LOC)
11683                                            ? REFFL
11684                                            : REFF),
11685                                     num);
11686                     *flagp |= HASWIDTH;
11687
11688                     /* override incorrect value set in reganode MJD */
11689                     Set_Node_Offset(ret, parse_start+1);
11690                     Set_Node_Cur_Length(ret, parse_start);
11691                     RExC_parse--;
11692                     nextchar(pRExC_state);
11693                 }
11694             }
11695             break;
11696         case '\0':
11697             if (RExC_parse >= RExC_end)
11698                 FAIL("Trailing \\");
11699             /* FALLTHROUGH */
11700         default:
11701             /* Do not generate "unrecognized" warnings here, we fall
11702                back into the quick-grab loop below */
11703             parse_start--;
11704             goto defchar;
11705         }
11706         break;
11707
11708     case '#':
11709         if (RExC_flags & RXf_PMf_EXTENDED) {
11710             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11711             if (RExC_parse < RExC_end)
11712                 goto tryagain;
11713         }
11714         /* FALLTHROUGH */
11715
11716     default:
11717
11718             parse_start = RExC_parse - 1;
11719
11720             RExC_parse++;
11721
11722         defchar: {
11723             STRLEN len = 0;
11724             UV ender = 0;
11725             char *p;
11726             char *s;
11727 #define MAX_NODE_STRING_SIZE 127
11728             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11729             char *s0;
11730             U8 upper_parse = MAX_NODE_STRING_SIZE;
11731             U8 node_type = compute_EXACTish(pRExC_state);
11732             bool next_is_quantifier;
11733             char * oldp = NULL;
11734
11735             /* We can convert EXACTF nodes to EXACTFU if they contain only
11736              * characters that match identically regardless of the target
11737              * string's UTF8ness.  The reason to do this is that EXACTF is not
11738              * trie-able, EXACTFU is.
11739              *
11740              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11741              * contain only above-Latin1 characters (hence must be in UTF8),
11742              * which don't participate in folds with Latin1-range characters,
11743              * as the latter's folds aren't known until runtime.  (We don't
11744              * need to figure this out until pass 2) */
11745             bool maybe_exactfu = PASS2
11746                                && (node_type == EXACTF || node_type == EXACTFL);
11747
11748             /* If a folding node contains only code points that don't
11749              * participate in folds, it can be changed into an EXACT node,
11750              * which allows the optimizer more things to look for */
11751             bool maybe_exact;
11752
11753             ret = reg_node(pRExC_state, node_type);
11754
11755             /* In pass1, folded, we use a temporary buffer instead of the
11756              * actual node, as the node doesn't exist yet */
11757             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11758
11759             s0 = s;
11760
11761         reparse:
11762
11763             /* We do the EXACTFish to EXACT node only if folding.  (And we
11764              * don't need to figure this out until pass 2) */
11765             maybe_exact = FOLD && PASS2;
11766
11767             /* XXX The node can hold up to 255 bytes, yet this only goes to
11768              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11769              * 255 allows us to not have to worry about overflow due to
11770              * converting to utf8 and fold expansion, but that value is
11771              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11772              * split up by this limit into a single one using the real max of
11773              * 255.  Even at 127, this breaks under rare circumstances.  If
11774              * folding, we do not want to split a node at a character that is a
11775              * non-final in a multi-char fold, as an input string could just
11776              * happen to want to match across the node boundary.  The join
11777              * would solve that problem if the join actually happens.  But a
11778              * series of more than two nodes in a row each of 127 would cause
11779              * the first join to succeed to get to 254, but then there wouldn't
11780              * be room for the next one, which could be one of those split
11781              * multi-char folds.  I don't know of any fool-proof solution.  One
11782              * could back off to end with only a code point that isn't such a
11783              * non-final, but it is possible for there not to be any in the
11784              * entire node. */
11785             for (p = RExC_parse - 1;
11786                  len < upper_parse && p < RExC_end;
11787                  len++)
11788             {
11789                 oldp = p;
11790
11791                 if (RExC_flags & RXf_PMf_EXTENDED)
11792                     p = regpatws(pRExC_state, p,
11793                                           TRUE); /* means recognize comments */
11794                 switch ((U8)*p) {
11795                 case '^':
11796                 case '$':
11797                 case '.':
11798                 case '[':
11799                 case '(':
11800                 case ')':
11801                 case '|':
11802                     goto loopdone;
11803                 case '\\':
11804                     /* Literal Escapes Switch
11805
11806                        This switch is meant to handle escape sequences that
11807                        resolve to a literal character.
11808
11809                        Every escape sequence that represents something
11810                        else, like an assertion or a char class, is handled
11811                        in the switch marked 'Special Escapes' above in this
11812                        routine, but also has an entry here as anything that
11813                        isn't explicitly mentioned here will be treated as
11814                        an unescaped equivalent literal.
11815                     */
11816
11817                     switch ((U8)*++p) {
11818                     /* These are all the special escapes. */
11819                     case 'A':             /* Start assertion */
11820                     case 'b': case 'B':   /* Word-boundary assertion*/
11821                     case 'C':             /* Single char !DANGEROUS! */
11822                     case 'd': case 'D':   /* digit class */
11823                     case 'g': case 'G':   /* generic-backref, pos assertion */
11824                     case 'h': case 'H':   /* HORIZWS */
11825                     case 'k': case 'K':   /* named backref, keep marker */
11826                     case 'p': case 'P':   /* Unicode property */
11827                               case 'R':   /* LNBREAK */
11828                     case 's': case 'S':   /* space class */
11829                     case 'v': case 'V':   /* VERTWS */
11830                     case 'w': case 'W':   /* word class */
11831                     case 'X':             /* eXtended Unicode "combining
11832                                              character sequence" */
11833                     case 'z': case 'Z':   /* End of line/string assertion */
11834                         --p;
11835                         goto loopdone;
11836
11837                     /* Anything after here is an escape that resolves to a
11838                        literal. (Except digits, which may or may not)
11839                      */
11840                     case 'n':
11841                         ender = '\n';
11842                         p++;
11843                         break;
11844                     case 'N': /* Handle a single-code point named character. */
11845                         /* The options cause it to fail if a multiple code
11846                          * point sequence.  Handle those in the switch() above
11847                          * */
11848                         RExC_parse = p + 1;
11849                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11850                                             flagp, depth, FALSE,
11851                                             FALSE /* not strict */ ))
11852                         {
11853                             if (*flagp & RESTART_UTF8)
11854                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11855                             RExC_parse = p = oldp;
11856                             goto loopdone;
11857                         }
11858                         p = RExC_parse;
11859                         if (ender > 0xff) {
11860                             REQUIRE_UTF8;
11861                         }
11862                         break;
11863                     case 'r':
11864                         ender = '\r';
11865                         p++;
11866                         break;
11867                     case 't':
11868                         ender = '\t';
11869                         p++;
11870                         break;
11871                     case 'f':
11872                         ender = '\f';
11873                         p++;
11874                         break;
11875                     case 'e':
11876                           ender = ASCII_TO_NATIVE('\033');
11877                         p++;
11878                         break;
11879                     case 'a':
11880                           ender = '\a';
11881                         p++;
11882                         break;
11883                     case 'o':
11884                         {
11885                             UV result;
11886                             const char* error_msg;
11887
11888                             bool valid = grok_bslash_o(&p,
11889                                                        &result,
11890                                                        &error_msg,
11891                                                        TRUE, /* out warnings */
11892                                                        FALSE, /* not strict */
11893                                                        TRUE, /* Output warnings
11894                                                                 for non-
11895                                                                 portables */
11896                                                        UTF);
11897                             if (! valid) {
11898                                 RExC_parse = p; /* going to die anyway; point
11899                                                    to exact spot of failure */
11900                                 vFAIL(error_msg);
11901                             }
11902                             ender = result;
11903                             if (PL_encoding && ender < 0x100) {
11904                                 goto recode_encoding;
11905                             }
11906                             if (ender > 0xff) {
11907                                 REQUIRE_UTF8;
11908                             }
11909                             break;
11910                         }
11911                     case 'x':
11912                         {
11913                             UV result = UV_MAX; /* initialize to erroneous
11914                                                    value */
11915                             const char* error_msg;
11916
11917                             bool valid = grok_bslash_x(&p,
11918                                                        &result,
11919                                                        &error_msg,
11920                                                        TRUE, /* out warnings */
11921                                                        FALSE, /* not strict */
11922                                                        TRUE, /* Output warnings
11923                                                                 for non-
11924                                                                 portables */
11925                                                        UTF);
11926                             if (! valid) {
11927                                 RExC_parse = p; /* going to die anyway; point
11928                                                    to exact spot of failure */
11929                                 vFAIL(error_msg);
11930                             }
11931                             ender = result;
11932
11933                             if (PL_encoding && ender < 0x100) {
11934                                 goto recode_encoding;
11935                             }
11936                             if (ender > 0xff) {
11937                                 REQUIRE_UTF8;
11938                             }
11939                             break;
11940                         }
11941                     case 'c':
11942                         p++;
11943                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11944                         break;
11945                     case '8': case '9': /* must be a backreference */
11946                         --p;
11947                         goto loopdone;
11948                     case '1': case '2': case '3':case '4':
11949                     case '5': case '6': case '7':
11950                         /* When we parse backslash escapes there is ambiguity
11951                          * between backreferences and octal escapes. Any escape
11952                          * from \1 - \9 is a backreference, any multi-digit
11953                          * escape which does not start with 0 and which when
11954                          * evaluated as decimal could refer to an already
11955                          * parsed capture buffer is a backreference. Anything else
11956                          * is octal.
11957                          *
11958                          * Note this implies that \118 could be interpreted as
11959                          * 118 OR as "\11" . "8" depending on whether there
11960                          * were 118 capture buffers defined already in the
11961                          * pattern.  */
11962
11963                         /* NOTE, RExC_npar is 1 more than the actual number of
11964                          * parens we have seen so far, hence the < RExC_npar below. */
11965
11966                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11967                         {  /* Not to be treated as an octal constant, go
11968                                    find backref */
11969                             --p;
11970                             goto loopdone;
11971                         }
11972                         /* FALLTHROUGH */
11973                     case '0':
11974                         {
11975                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11976                             STRLEN numlen = 3;
11977                             ender = grok_oct(p, &numlen, &flags, NULL);
11978                             if (ender > 0xff) {
11979                                 REQUIRE_UTF8;
11980                             }
11981                             p += numlen;
11982                             if (SIZE_ONLY   /* like \08, \178 */
11983                                 && numlen < 3
11984                                 && p < RExC_end
11985                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11986                             {
11987                                 reg_warn_non_literal_string(
11988                                          p + 1,
11989                                          form_short_octal_warning(p, numlen));
11990                             }
11991                         }
11992                         if (PL_encoding && ender < 0x100)
11993                             goto recode_encoding;
11994                         break;
11995                     recode_encoding:
11996                         if (! RExC_override_recoding) {
11997                             SV* enc = PL_encoding;
11998                             ender = reg_recode((const char)(U8)ender, &enc);
11999                             if (!enc && SIZE_ONLY)
12000                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12001                             REQUIRE_UTF8;
12002                         }
12003                         break;
12004                     case '\0':
12005                         if (p >= RExC_end)
12006                             FAIL("Trailing \\");
12007                         /* FALLTHROUGH */
12008                     default:
12009                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12010                             /* Include any { following the alpha to emphasize
12011                              * that it could be part of an escape at some point
12012                              * in the future */
12013                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12014                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12015                         }
12016                         goto normal_default;
12017                     } /* End of switch on '\' */
12018                     break;
12019                 case '{':
12020                     /* Currently we don't warn when the lbrace is at the start
12021                      * of a construct.  This catches it in the middle of a
12022                      * literal string, or when it's the first thing after
12023                      * something like "\b" */
12024                     if (! SIZE_ONLY
12025                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12026                     {
12027                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12028                     }
12029                     /*FALLTHROUGH*/
12030                 default:    /* A literal character */
12031                   normal_default:
12032                     if (UTF8_IS_START(*p) && UTF) {
12033                         STRLEN numlen;
12034                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12035                                                &numlen, UTF8_ALLOW_DEFAULT);
12036                         p += numlen;
12037                     }
12038                     else
12039                         ender = (U8) *p++;
12040                     break;
12041                 } /* End of switch on the literal */
12042
12043                 /* Here, have looked at the literal character and <ender>
12044                  * contains its ordinal, <p> points to the character after it
12045                  */
12046
12047                 if ( RExC_flags & RXf_PMf_EXTENDED)
12048                     p = regpatws(pRExC_state, p,
12049                                           TRUE); /* means recognize comments */
12050
12051                 /* If the next thing is a quantifier, it applies to this
12052                  * character only, which means that this character has to be in
12053                  * its own node and can't just be appended to the string in an
12054                  * existing node, so if there are already other characters in
12055                  * the node, close the node with just them, and set up to do
12056                  * this character again next time through, when it will be the
12057                  * only thing in its new node */
12058                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12059                 {
12060                     p = oldp;
12061                     goto loopdone;
12062                 }
12063
12064                 if (! FOLD   /* The simple case, just append the literal */
12065                     || (LOC  /* Also don't fold for tricky chars under /l */
12066                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12067                 {
12068                     if (UTF) {
12069                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12070                         if (unilen > 0) {
12071                            s   += unilen;
12072                            len += unilen;
12073                         }
12074
12075                         /* The loop increments <len> each time, as all but this
12076                          * path (and one other) through it add a single byte to
12077                          * the EXACTish node.  But this one has changed len to
12078                          * be the correct final value, so subtract one to
12079                          * cancel out the increment that follows */
12080                         len--;
12081                     }
12082                     else {
12083                         REGC((char)ender, s++);
12084                     }
12085
12086                     /* Can get here if folding only if is one of the /l
12087                      * characters whose fold depends on the locale.  The
12088                      * occurrence of any of these indicate that we can't
12089                      * simplify things */
12090                     if (FOLD) {
12091                         maybe_exact = FALSE;
12092                         maybe_exactfu = FALSE;
12093                     }
12094                 }
12095                 else             /* FOLD */
12096                      if (! ( UTF
12097                         /* See comments for join_exact() as to why we fold this
12098                          * non-UTF at compile time */
12099                         || (node_type == EXACTFU
12100                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12101                 {
12102                     /* Here, are folding and are not UTF-8 encoded; therefore
12103                      * the character must be in the range 0-255, and is not /l
12104                      * (Not /l because we already handled these under /l in
12105                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12106                     if (IS_IN_SOME_FOLD_L1(ender)) {
12107                         maybe_exact = FALSE;
12108
12109                         /* See if the character's fold differs between /d and
12110                          * /u.  This includes the multi-char fold SHARP S to
12111                          * 'ss' */
12112                         if (maybe_exactfu
12113                             && (PL_fold[ender] != PL_fold_latin1[ender]
12114                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12115                                 || (len > 0
12116                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12117                                    && isARG2_lower_or_UPPER_ARG1('s',
12118                                                                  *(s-1)))))
12119                         {
12120                             maybe_exactfu = FALSE;
12121                         }
12122                     }
12123
12124                     /* Even when folding, we store just the input character, as
12125                      * we have an array that finds its fold quickly */
12126                     *(s++) = (char) ender;
12127                 }
12128                 else {  /* FOLD and UTF */
12129                     /* Unlike the non-fold case, we do actually have to
12130                      * calculate the results here in pass 1.  This is for two
12131                      * reasons, the folded length may be longer than the
12132                      * unfolded, and we have to calculate how many EXACTish
12133                      * nodes it will take; and we may run out of room in a node
12134                      * in the middle of a potential multi-char fold, and have
12135                      * to back off accordingly.  (Hence we can't use REGC for
12136                      * the simple case just below.) */
12137
12138                     UV folded;
12139                     if (isASCII(ender)) {
12140                         folded = toFOLD(ender);
12141                         *(s)++ = (U8) folded;
12142                     }
12143                     else {
12144                         STRLEN foldlen;
12145
12146                         folded = _to_uni_fold_flags(
12147                                      ender,
12148                                      (U8 *) s,
12149                                      &foldlen,
12150                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12151                                                         ? FOLD_FLAGS_NOMIX_ASCII
12152                                                         : 0));
12153                         s += foldlen;
12154
12155                         /* The loop increments <len> each time, as all but this
12156                          * path (and one other) through it add a single byte to
12157                          * the EXACTish node.  But this one has changed len to
12158                          * be the correct final value, so subtract one to
12159                          * cancel out the increment that follows */
12160                         len += foldlen - 1;
12161                     }
12162                     /* If this node only contains non-folding code points so
12163                      * far, see if this new one is also non-folding */
12164                     if (maybe_exact) {
12165                         if (folded != ender) {
12166                             maybe_exact = FALSE;
12167                         }
12168                         else {
12169                             /* Here the fold is the original; we have to check
12170                              * further to see if anything folds to it */
12171                             if (_invlist_contains_cp(PL_utf8_foldable,
12172                                                         ender))
12173                             {
12174                                 maybe_exact = FALSE;
12175                             }
12176                         }
12177                     }
12178                     ender = folded;
12179                 }
12180
12181                 if (next_is_quantifier) {
12182
12183                     /* Here, the next input is a quantifier, and to get here,
12184                      * the current character is the only one in the node.
12185                      * Also, here <len> doesn't include the final byte for this
12186                      * character */
12187                     len++;
12188                     goto loopdone;
12189                 }
12190
12191             } /* End of loop through literal characters */
12192
12193             /* Here we have either exhausted the input or ran out of room in
12194              * the node.  (If we encountered a character that can't be in the
12195              * node, transfer is made directly to <loopdone>, and so we
12196              * wouldn't have fallen off the end of the loop.)  In the latter
12197              * case, we artificially have to split the node into two, because
12198              * we just don't have enough space to hold everything.  This
12199              * creates a problem if the final character participates in a
12200              * multi-character fold in the non-final position, as a match that
12201              * should have occurred won't, due to the way nodes are matched,
12202              * and our artificial boundary.  So back off until we find a non-
12203              * problematic character -- one that isn't at the beginning or
12204              * middle of such a fold.  (Either it doesn't participate in any
12205              * folds, or appears only in the final position of all the folds it
12206              * does participate in.)  A better solution with far fewer false
12207              * positives, and that would fill the nodes more completely, would
12208              * be to actually have available all the multi-character folds to
12209              * test against, and to back-off only far enough to be sure that
12210              * this node isn't ending with a partial one.  <upper_parse> is set
12211              * further below (if we need to reparse the node) to include just
12212              * up through that final non-problematic character that this code
12213              * identifies, so when it is set to less than the full node, we can
12214              * skip the rest of this */
12215             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12216
12217                 const STRLEN full_len = len;
12218
12219                 assert(len >= MAX_NODE_STRING_SIZE);
12220
12221                 /* Here, <s> points to the final byte of the final character.
12222                  * Look backwards through the string until find a non-
12223                  * problematic character */
12224
12225                 if (! UTF) {
12226
12227                     /* This has no multi-char folds to non-UTF characters */
12228                     if (ASCII_FOLD_RESTRICTED) {
12229                         goto loopdone;
12230                     }
12231
12232                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12233                     len = s - s0 + 1;
12234                 }
12235                 else {
12236                     if (!  PL_NonL1NonFinalFold) {
12237                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12238                                         NonL1_Perl_Non_Final_Folds_invlist);
12239                     }
12240
12241                     /* Point to the first byte of the final character */
12242                     s = (char *) utf8_hop((U8 *) s, -1);
12243
12244                     while (s >= s0) {   /* Search backwards until find
12245                                            non-problematic char */
12246                         if (UTF8_IS_INVARIANT(*s)) {
12247
12248                             /* There are no ascii characters that participate
12249                              * in multi-char folds under /aa.  In EBCDIC, the
12250                              * non-ascii invariants are all control characters,
12251                              * so don't ever participate in any folds. */
12252                             if (ASCII_FOLD_RESTRICTED
12253                                 || ! IS_NON_FINAL_FOLD(*s))
12254                             {
12255                                 break;
12256                             }
12257                         }
12258                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12259                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12260                                                                   *s, *(s+1))))
12261                             {
12262                                 break;
12263                             }
12264                         }
12265                         else if (! _invlist_contains_cp(
12266                                         PL_NonL1NonFinalFold,
12267                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12268                         {
12269                             break;
12270                         }
12271
12272                         /* Here, the current character is problematic in that
12273                          * it does occur in the non-final position of some
12274                          * fold, so try the character before it, but have to
12275                          * special case the very first byte in the string, so
12276                          * we don't read outside the string */
12277                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12278                     } /* End of loop backwards through the string */
12279
12280                     /* If there were only problematic characters in the string,
12281                      * <s> will point to before s0, in which case the length
12282                      * should be 0, otherwise include the length of the
12283                      * non-problematic character just found */
12284                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12285                 }
12286
12287                 /* Here, have found the final character, if any, that is
12288                  * non-problematic as far as ending the node without splitting
12289                  * it across a potential multi-char fold.  <len> contains the
12290                  * number of bytes in the node up-to and including that
12291                  * character, or is 0 if there is no such character, meaning
12292                  * the whole node contains only problematic characters.  In
12293                  * this case, give up and just take the node as-is.  We can't
12294                  * do any better */
12295                 if (len == 0) {
12296                     len = full_len;
12297
12298                     /* If the node ends in an 's' we make sure it stays EXACTF,
12299                      * as if it turns into an EXACTFU, it could later get
12300                      * joined with another 's' that would then wrongly match
12301                      * the sharp s */
12302                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12303                     {
12304                         maybe_exactfu = FALSE;
12305                     }
12306                 } else {
12307
12308                     /* Here, the node does contain some characters that aren't
12309                      * problematic.  If one such is the final character in the
12310                      * node, we are done */
12311                     if (len == full_len) {
12312                         goto loopdone;
12313                     }
12314                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12315
12316                         /* If the final character is problematic, but the
12317                          * penultimate is not, back-off that last character to
12318                          * later start a new node with it */
12319                         p = oldp;
12320                         goto loopdone;
12321                     }
12322
12323                     /* Here, the final non-problematic character is earlier
12324                      * in the input than the penultimate character.  What we do
12325                      * is reparse from the beginning, going up only as far as
12326                      * this final ok one, thus guaranteeing that the node ends
12327                      * in an acceptable character.  The reason we reparse is
12328                      * that we know how far in the character is, but we don't
12329                      * know how to correlate its position with the input parse.
12330                      * An alternate implementation would be to build that
12331                      * correlation as we go along during the original parse,
12332                      * but that would entail extra work for every node, whereas
12333                      * this code gets executed only when the string is too
12334                      * large for the node, and the final two characters are
12335                      * problematic, an infrequent occurrence.  Yet another
12336                      * possible strategy would be to save the tail of the
12337                      * string, and the next time regatom is called, initialize
12338                      * with that.  The problem with this is that unless you
12339                      * back off one more character, you won't be guaranteed
12340                      * regatom will get called again, unless regbranch,
12341                      * regpiece ... are also changed.  If you do back off that
12342                      * extra character, so that there is input guaranteed to
12343                      * force calling regatom, you can't handle the case where
12344                      * just the first character in the node is acceptable.  I
12345                      * (khw) decided to try this method which doesn't have that
12346                      * pitfall; if performance issues are found, we can do a
12347                      * combination of the current approach plus that one */
12348                     upper_parse = len;
12349                     len = 0;
12350                     s = s0;
12351                     goto reparse;
12352                 }
12353             }   /* End of verifying node ends with an appropriate char */
12354
12355         loopdone:   /* Jumped to when encounters something that shouldn't be in
12356                        the node */
12357
12358             /* I (khw) don't know if you can get here with zero length, but the
12359              * old code handled this situation by creating a zero-length EXACT
12360              * node.  Might as well be NOTHING instead */
12361             if (len == 0) {
12362                 OP(ret) = NOTHING;
12363             }
12364             else {
12365                 if (FOLD) {
12366                     /* If 'maybe_exact' is still set here, means there are no
12367                      * code points in the node that participate in folds;
12368                      * similarly for 'maybe_exactfu' and code points that match
12369                      * differently depending on UTF8ness of the target string
12370                      * (for /u), or depending on locale for /l */
12371                     if (maybe_exact) {
12372                         OP(ret) = EXACT;
12373                     }
12374                     else if (maybe_exactfu) {
12375                         OP(ret) = EXACTFU;
12376                     }
12377                 }
12378                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12379                                            FALSE /* Don't look to see if could
12380                                                     be turned into an EXACT
12381                                                     node, as we have already
12382                                                     computed that */
12383                                           );
12384             }
12385
12386             RExC_parse = p - 1;
12387             Set_Node_Cur_Length(ret, parse_start);
12388             nextchar(pRExC_state);
12389             {
12390                 /* len is STRLEN which is unsigned, need to copy to signed */
12391                 IV iv = len;
12392                 if (iv < 0)
12393                     vFAIL("Internal disaster");
12394             }
12395
12396         } /* End of label 'defchar:' */
12397         break;
12398     } /* End of giant switch on input character */
12399
12400     return(ret);
12401 }
12402
12403 STATIC char *
12404 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12405 {
12406     /* Returns the next non-pattern-white space, non-comment character (the
12407      * latter only if 'recognize_comment is true) in the string p, which is
12408      * ended by RExC_end.  See also reg_skipcomment */
12409     const char *e = RExC_end;
12410
12411     PERL_ARGS_ASSERT_REGPATWS;
12412
12413     while (p < e) {
12414         STRLEN len;
12415         if ((len = is_PATWS_safe(p, e, UTF))) {
12416             p += len;
12417         }
12418         else if (recognize_comment && *p == '#') {
12419             p = reg_skipcomment(pRExC_state, p);
12420         }
12421         else
12422             break;
12423     }
12424     return p;
12425 }
12426
12427 STATIC void
12428 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12429 {
12430     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12431      * sets up the bitmap and any flags, removing those code points from the
12432      * inversion list, setting it to NULL should it become completely empty */
12433
12434     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12435     assert(PL_regkind[OP(node)] == ANYOF);
12436
12437     ANYOF_BITMAP_ZERO(node);
12438     if (*invlist_ptr) {
12439
12440         /* This gets set if we actually need to modify things */
12441         bool change_invlist = FALSE;
12442
12443         UV start, end;
12444
12445         /* Start looking through *invlist_ptr */
12446         invlist_iterinit(*invlist_ptr);
12447         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12448             UV high;
12449             int i;
12450
12451             if (end == UV_MAX && start <= 256) {
12452                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12453             }
12454             else if (end >= 256) {
12455                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12456             }
12457
12458             /* Quit if are above what we should change */
12459             if (start > 255) {
12460                 break;
12461             }
12462
12463             change_invlist = TRUE;
12464
12465             /* Set all the bits in the range, up to the max that we are doing */
12466             high = (end < 255) ? end : 255;
12467             for (i = start; i <= (int) high; i++) {
12468                 if (! ANYOF_BITMAP_TEST(node, i)) {
12469                     ANYOF_BITMAP_SET(node, i);
12470                 }
12471             }
12472         }
12473         invlist_iterfinish(*invlist_ptr);
12474
12475         /* Done with loop; remove any code points that are in the bitmap from
12476          * *invlist_ptr; similarly for code points above latin1 if we have a
12477          * flag to match all of them anyways */
12478         if (change_invlist) {
12479             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12480         }
12481         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12482             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12483         }
12484
12485         /* If have completely emptied it, remove it completely */
12486         if (_invlist_len(*invlist_ptr) == 0) {
12487             SvREFCNT_dec_NN(*invlist_ptr);
12488             *invlist_ptr = NULL;
12489         }
12490     }
12491 }
12492
12493 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12494    Character classes ([:foo:]) can also be negated ([:^foo:]).
12495    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12496    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12497    but trigger failures because they are currently unimplemented. */
12498
12499 #define POSIXCC_DONE(c)   ((c) == ':')
12500 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12501 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12502
12503 PERL_STATIC_INLINE I32
12504 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12505 {
12506     I32 namedclass = OOB_NAMEDCLASS;
12507
12508     PERL_ARGS_ASSERT_REGPPOSIXCC;
12509
12510     if (value == '[' && RExC_parse + 1 < RExC_end &&
12511         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12512         POSIXCC(UCHARAT(RExC_parse)))
12513     {
12514         const char c = UCHARAT(RExC_parse);
12515         char* const s = RExC_parse++;
12516
12517         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12518             RExC_parse++;
12519         if (RExC_parse == RExC_end) {
12520             if (strict) {
12521
12522                 /* Try to give a better location for the error (than the end of
12523                  * the string) by looking for the matching ']' */
12524                 RExC_parse = s;
12525                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12526                     RExC_parse++;
12527                 }
12528                 vFAIL2("Unmatched '%c' in POSIX class", c);
12529             }
12530             /* Grandfather lone [:, [=, [. */
12531             RExC_parse = s;
12532         }
12533         else {
12534             const char* const t = RExC_parse++; /* skip over the c */
12535             assert(*t == c);
12536
12537             if (UCHARAT(RExC_parse) == ']') {
12538                 const char *posixcc = s + 1;
12539                 RExC_parse++; /* skip over the ending ] */
12540
12541                 if (*s == ':') {
12542                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12543                     const I32 skip = t - posixcc;
12544
12545                     /* Initially switch on the length of the name.  */
12546                     switch (skip) {
12547                     case 4:
12548                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12549                                                           this is the Perl \w
12550                                                         */
12551                             namedclass = ANYOF_WORDCHAR;
12552                         break;
12553                     case 5:
12554                         /* Names all of length 5.  */
12555                         /* alnum alpha ascii blank cntrl digit graph lower
12556                            print punct space upper  */
12557                         /* Offset 4 gives the best switch position.  */
12558                         switch (posixcc[4]) {
12559                         case 'a':
12560                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12561                                 namedclass = ANYOF_ALPHA;
12562                             break;
12563                         case 'e':
12564                             if (memEQ(posixcc, "spac", 4)) /* space */
12565                                 namedclass = ANYOF_PSXSPC;
12566                             break;
12567                         case 'h':
12568                             if (memEQ(posixcc, "grap", 4)) /* graph */
12569                                 namedclass = ANYOF_GRAPH;
12570                             break;
12571                         case 'i':
12572                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12573                                 namedclass = ANYOF_ASCII;
12574                             break;
12575                         case 'k':
12576                             if (memEQ(posixcc, "blan", 4)) /* blank */
12577                                 namedclass = ANYOF_BLANK;
12578                             break;
12579                         case 'l':
12580                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12581                                 namedclass = ANYOF_CNTRL;
12582                             break;
12583                         case 'm':
12584                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12585                                 namedclass = ANYOF_ALPHANUMERIC;
12586                             break;
12587                         case 'r':
12588                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12589                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12590                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12591                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12592                             break;
12593                         case 't':
12594                             if (memEQ(posixcc, "digi", 4)) /* digit */
12595                                 namedclass = ANYOF_DIGIT;
12596                             else if (memEQ(posixcc, "prin", 4)) /* print */
12597                                 namedclass = ANYOF_PRINT;
12598                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12599                                 namedclass = ANYOF_PUNCT;
12600                             break;
12601                         }
12602                         break;
12603                     case 6:
12604                         if (memEQ(posixcc, "xdigit", 6))
12605                             namedclass = ANYOF_XDIGIT;
12606                         break;
12607                     }
12608
12609                     if (namedclass == OOB_NAMEDCLASS)
12610                         vFAIL2utf8f(
12611                             "POSIX class [:%"UTF8f":] unknown",
12612                             UTF8fARG(UTF, t - s - 1, s + 1));
12613
12614                     /* The #defines are structured so each complement is +1 to
12615                      * the normal one */
12616                     if (complement) {
12617                         namedclass++;
12618                     }
12619                     assert (posixcc[skip] == ':');
12620                     assert (posixcc[skip+1] == ']');
12621                 } else if (!SIZE_ONLY) {
12622                     /* [[=foo=]] and [[.foo.]] are still future. */
12623
12624                     /* adjust RExC_parse so the warning shows after
12625                        the class closes */
12626                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12627                         RExC_parse++;
12628                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12629                 }
12630             } else {
12631                 /* Maternal grandfather:
12632                  * "[:" ending in ":" but not in ":]" */
12633                 if (strict) {
12634                     vFAIL("Unmatched '[' in POSIX class");
12635                 }
12636
12637                 /* Grandfather lone [:, [=, [. */
12638                 RExC_parse = s;
12639             }
12640         }
12641     }
12642
12643     return namedclass;
12644 }
12645
12646 STATIC bool
12647 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12648 {
12649     /* Heuristic check, made with the parse pointer sitting on a '[', of
12650      * whether the text that follows looks as though it was meant to be a
12651      * [:posix:] class.  TRUE comes back for a well-formed class, and also for
12652      * text that appears to be a failed attempt at writing one.
12653      *
12654      * TRUE is returned for
12655      *      [:alphanumerics:
12656      *      [:alphanumerics]  (provided the ']' is not immediately followed
12657      *                         by a ')', which would be the end of the (?[
12658      *      [:any garbage including %^&$ punctuation:]
12659      * where ':' stands for whichever delimiter POSIXCC() accepts.
12660      *
12661      * The intended caller is S_handle_regex_sets.  The start of regclass()
12662      * has a similar check for a class written without its surrounding []
12663      * ([:word:] instead of [[:word:]]) and could be made to call this, but
12664      * that would change long-standing behavior, so it (khw) was not done */
12665     char * const bracket = RExC_parse;      /* the opening '[' */
12666     const char delim = bracket[1];          /* e.g. the ':' in "[:" */
12667     char *scan = bracket + 2;
12668     char *closer;
12669
12670     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12671
12672     assert(*bracket == '[');
12673
12674     if (! POSIXCC(delim)) {
12675         return FALSE;
12676     }
12677
12678     /* Step over the run of word characters after the delimiter; reaching
12679      * RExC_end while doing so yields FALSE */
12680     for (; scan < RExC_end && isWORDCHAR(*scan); scan++) { }
12681     if (scan >= RExC_end) {
12682         return FALSE;
12683     }
12684     if (scan - bracket > 2) {   /* Got at least 1 word character */
12685         if (*scan == delim) {
12686             return TRUE;
12687         }
12688         if (*scan == ']' && scan + 1 < RExC_end && scan[1] != ')') {
12689             return TRUE;
12690         }
12691     }
12692
12693     /* Else require a ']' preceded by the delimiter.  [:] evaluates to colon;
12694      * [::] is a bad posix class. */
12695     closer = (char *) memchr(bracket, ']', RExC_end - bracket);
12696     return (closer && closer - bracket > 2 && delim == closer[-1]);
12697 }
12698
12699 STATIC regnode *
12700 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12701                     I32 *flagp, U32 depth,
12702                     char * const oregcomp_parse)
12703 {
12704     /* Handle the (?[...]) construct to do set operations */
12705
12706     U8 curchar;
12707     UV start, end;      /* End points of code point ranges */
12708     SV* result_string;
12709     char *save_end, *save_parse;
12710     SV* final;
12711     STRLEN len;
12712     regnode* node;
12713     AV* stack;
12714     const bool save_fold = FOLD;
12715
12716     GET_RE_DEBUG_FLAGS_DECL;
12717
12718     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12719
12720     if (LOC) {
12721         vFAIL("(?[...]) not valid in locale");
12722     }
12723     RExC_uni_semantics = 1;
12724
12725     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12726      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12727      * call regclass to handle '[]' so as to not have to reinvent its parsing
12728      * rules here (throwing away the size it computes each time).  And, we exit
12729      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12730      * these things, we need to realize that something preceded by a backslash
12731      * is escaped, so we have to keep track of backslashes */
12732     if (SIZE_ONLY) {
12733         UV depth = 0; /* how many nested (?[...]) constructs */
12734
12735         Perl_ck_warner_d(aTHX_
12736             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12737             "The regex_sets feature is experimental" REPORT_LOCATION,
12738                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12739                 UTF8fARG(UTF,
12740                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12741                          RExC_precomp + (RExC_parse - RExC_precomp)));
12742
12743         while (RExC_parse < RExC_end) {
12744             SV* current = NULL;
12745             RExC_parse = regpatws(pRExC_state, RExC_parse,
12746                                           TRUE); /* means recognize comments */
12747             switch (*RExC_parse) {
12748                 case '?':
12749                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12750                     /* FALLTHROUGH */
12751                 default:
12752                     break;
12753                 case '\\':
12754                     /* Skip the next byte (which could cause us to end up in
12755                      * the middle of a UTF-8 character, but since none of those
12756                      * are confusable with anything we currently handle in this
12757                      * switch (invariants all), it's safe.  We'll just hit the
12758                      * default: case next time and keep on incrementing until
12759                      * we find one of the invariants we do handle. */
12760                     RExC_parse++;
12761                     break;
12762                 case '[':
12763                 {
12764                     /* If this looks like it is a [:posix:] class, leave the
12765                      * parse pointer at the '[' to fool regclass() into
12766                      * thinking it is part of a '[[:posix:]]'.  That function
12767                      * will use strict checking to force a syntax error if it
12768                      * doesn't work out to a legitimate class */
12769                     bool is_posix_class
12770                                     = could_it_be_a_POSIX_class(pRExC_state);
12771                     if (! is_posix_class) {
12772                         RExC_parse++;
12773                     }
12774
12775                     /* regclass() can only return RESTART_UTF8 if multi-char
12776                        folds are allowed.  */
12777                     if (!regclass(pRExC_state, flagp,depth+1,
12778                                   is_posix_class, /* parse the whole char
12779                                                      class only if not a
12780                                                      posix class */
12781                                   FALSE, /* don't allow multi-char folds */
12782                                   TRUE, /* silence non-portable warnings. */
12783                                   &current))
12784                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12785                               (UV) *flagp);
12786
12787                     /* function call leaves parse pointing to the ']', except
12788                      * if we faked it */
12789                     if (is_posix_class) {
12790                         RExC_parse--;
12791                     }
12792
12793                     SvREFCNT_dec(current);   /* In case it returned something */
12794                     break;
12795                 }
12796
12797                 case ']':
12798                     if (depth--) break;
12799                     RExC_parse++;
12800                     if (RExC_parse < RExC_end
12801                         && *RExC_parse == ')')
12802                     {
12803                         node = reganode(pRExC_state, ANYOF, 0);
12804                         RExC_size += ANYOF_SKIP;
12805                         nextchar(pRExC_state);
12806                         Set_Node_Length(node,
12807                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12808                         return node;
12809                     }
12810                     goto no_close;
12811             }
12812             RExC_parse++;
12813         }
12814
12815         no_close:
12816         FAIL("Syntax error in (?[...])");
12817     }
12818
12819     /* Pass 2 only after this.  Everything in this construct is a
12820      * metacharacter.  Operands begin with either a '\' (for an escape
12821      * sequence), or a '[' for a bracketed character class.  Any other
12822      * character should be an operator, or parenthesis for grouping.  Both
12823      * types of operands are handled by calling regclass() to parse them.  It
12824      * is called with a parameter to indicate to return the computed inversion
12825      * list.  The parsing here is implemented via a stack.  Each entry on the
12826      * stack is a single character representing one of the operators, or the
12827      * '('; or else a pointer to an operand inversion list. */
12828
12829 #define IS_OPERAND(a)  (! SvIOK(a))
12830
12831     /* The stack starts empty.  It is a syntax error if the first thing parsed
12832      * is a binary operator; everything else is pushed on the stack.  When an
12833      * operand is parsed, the top of the stack is examined.  If it is a binary
12834      * operator, the item before it should be an operand, and both are replaced
12835      * by the result of doing that operation on the new operand and the one on
12836      * the stack.   Thus a sequence of binary operands is reduced to a single
12837      * one before the next one is parsed.
12838      *
12839      * A unary operator may immediately follow a binary in the input, for
12840      * example
12841      *      [a] + ! [b]
12842      * When an operand is parsed and the top of the stack is a unary operator,
12843      * the operation is performed, and then the stack is rechecked to see if
12844      * this new operand is part of a binary operation; if so, it is handled as
12845      * above.
12846      *
12847      * A '(' is simply pushed on the stack; it is valid only if the stack is
12848      * empty, or the top element of the stack is an operator or another '('
12849      * (for which the parenthesized expression will become an operand).  By the
12850      * time the corresponding ')' is parsed everything in between should have
12851      * been parsed and evaluated to a single operand (or else is a syntax
12852      * error), and is handled as a regular operand */
12853
12854     sv_2mortal((SV *)(stack = newAV()));
12855
12856     while (RExC_parse < RExC_end) {
12857         I32 top_index = av_tindex(stack);
12858         SV** top_ptr;
12859         SV* current = NULL;
12860
12861         /* Skip white space */
12862         RExC_parse = regpatws(pRExC_state, RExC_parse,
12863                                          TRUE /* means recognize comments */ );
12864         if (RExC_parse >= RExC_end) {
12865             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12866         }
12867         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12868             break;
12869         }
12870
12871         switch (curchar) {
12872
12873             case '?':
12874                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12875                                                safely subtract 1 from
12876                                                RExC_parse in the next clause.
12877                                                If we have something on the
12878                                                stack, we have parsed something
12879                                              */
12880                     && UCHARAT(RExC_parse - 1) == '('
12881                     && RExC_parse < RExC_end)
12882                 {
12883                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12884                      * This happens when we have some thing like
12885                      *
12886                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12887                      *   ...
12888                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12889                      *
12890                      * Here we would be handling the interpolated
12891                      * '$thai_or_lao'.  We handle this by a recursive call to
12892                      * ourselves which returns the inversion list the
12893                      * interpolated expression evaluates to.  We use the flags
12894                      * from the interpolated pattern. */
12895                     U32 save_flags = RExC_flags;
12896                     const char * const save_parse = ++RExC_parse;
12897
12898                     parse_lparen_question_flags(pRExC_state);
12899
12900                     if (RExC_parse == save_parse  /* Makes sure there was at
12901                                                      least one flag (or this
12902                                                      embedding wasn't compiled)
12903                                                    */
12904                         || RExC_parse >= RExC_end - 4
12905                         || UCHARAT(RExC_parse) != ':'
12906                         || UCHARAT(++RExC_parse) != '('
12907                         || UCHARAT(++RExC_parse) != '?'
12908                         || UCHARAT(++RExC_parse) != '[')
12909                     {
12910
12911                         /* In combination with the above, this moves the
12912                          * pointer to the point just after the first erroneous
12913                          * character (or if there are no flags, to where they
12914                          * should have been) */
12915                         if (RExC_parse >= RExC_end - 4) {
12916                             RExC_parse = RExC_end;
12917                         }
12918                         else if (RExC_parse != save_parse) {
12919                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12920                         }
12921                         vFAIL("Expecting '(?flags:(?[...'");
12922                     }
12923                     RExC_parse++;
12924                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12925                                                     depth+1, oregcomp_parse);
12926
12927                     /* Here, 'current' contains the embedded expression's
12928                      * inversion list, and RExC_parse points to the trailing
12929                      * ']'; the next character should be the ')' which will be
12930                      * paired with the '(' that has been put on the stack, so
12931                      * the whole embedded expression reduces to '(operand)' */
12932                     RExC_parse++;
12933
12934                     RExC_flags = save_flags;
12935                     goto handle_operand;
12936                 }
12937                 /* FALLTHROUGH */
12938
12939             default:
12940                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12941                 vFAIL("Unexpected character");
12942
12943             case '\\':
12944                 /* regclass() can only return RESTART_UTF8 if multi-char
12945                    folds are allowed.  */
12946                 if (!regclass(pRExC_state, flagp,depth+1,
12947                               TRUE, /* means parse just the next thing */
12948                               FALSE, /* don't allow multi-char folds */
12949                               FALSE, /* don't silence non-portable warnings.  */
12950                               &current))
12951                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12952                           (UV) *flagp);
12953                 /* regclass() will return with parsing just the \ sequence,
12954                  * leaving the parse pointer at the next thing to parse */
12955                 RExC_parse--;
12956                 goto handle_operand;
12957
12958             case '[':   /* Is a bracketed character class */
12959             {
12960                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12961
12962                 if (! is_posix_class) {
12963                     RExC_parse++;
12964                 }
12965
12966                 /* regclass() can only return RESTART_UTF8 if multi-char
12967                    folds are allowed.  */
12968                 if(!regclass(pRExC_state, flagp,depth+1,
12969                              is_posix_class, /* parse the whole char class
12970                                                 only if not a posix class */
12971                              FALSE, /* don't allow multi-char folds */
12972                              FALSE, /* don't silence non-portable warnings.  */
12973                              &current))
12974                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12975                           (UV) *flagp);
12976                 /* function call leaves parse pointing to the ']', except if we
12977                  * faked it */
12978                 if (is_posix_class) {
12979                     RExC_parse--;
12980                 }
12981
12982                 goto handle_operand;
12983             }
12984
12985             case '&':
12986             case '|':
12987             case '+':
12988             case '-':
12989             case '^':
12990                 if (top_index < 0
12991                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12992                     || ! IS_OPERAND(*top_ptr))
12993                 {
12994                     RExC_parse++;
12995                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12996                 }
12997                 av_push(stack, newSVuv(curchar));
12998                 break;
12999
13000             case '!':
13001                 av_push(stack, newSVuv(curchar));
13002                 break;
13003
13004             case '(':
13005                 if (top_index >= 0) {
13006                     top_ptr = av_fetch(stack, top_index, FALSE);
13007                     assert(top_ptr);
13008                     if (IS_OPERAND(*top_ptr)) {
13009                         RExC_parse++;
13010                         vFAIL("Unexpected '(' with no preceding operator");
13011                     }
13012                 }
13013                 av_push(stack, newSVuv(curchar));
13014                 break;
13015
13016             case ')':
13017             {
13018                 SV* lparen;
13019                 if (top_index < 1
13020                     || ! (current = av_pop(stack))
13021                     || ! IS_OPERAND(current)
13022                     || ! (lparen = av_pop(stack))
13023                     || IS_OPERAND(lparen)
13024                     || SvUV(lparen) != '(')
13025                 {
13026                     SvREFCNT_dec(current);
13027                     RExC_parse++;
13028                     vFAIL("Unexpected ')'");
13029                 }
13030                 top_index -= 2;
13031                 SvREFCNT_dec_NN(lparen);
13032
13033                 /* FALLTHROUGH */
13034             }
13035
13036               handle_operand:
13037
13038                 /* Here, we have an operand to process, in 'current' */
13039
13040                 if (top_index < 0) {    /* Just push if stack is empty */
13041                     av_push(stack, current);
13042                 }
13043                 else {
13044                     SV* top = av_pop(stack);
13045                     SV *prev = NULL;
13046                     char current_operator;
13047
13048                     if (IS_OPERAND(top)) {
13049                         SvREFCNT_dec_NN(top);
13050                         SvREFCNT_dec_NN(current);
13051                         vFAIL("Operand with no preceding operator");
13052                     }
13053                     current_operator = (char) SvUV(top);
13054                     switch (current_operator) {
13055                         case '(':   /* Push the '(' back on followed by the new
13056                                        operand */
13057                             av_push(stack, top);
13058                             av_push(stack, current);
13059                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13060                                                    just after the 'break', so
13061                                                    it doesn't get wrongly freed
13062                                                  */
13063                             break;
13064
13065                         case '!':
13066                             _invlist_invert(current);
13067
13068                             /* Unlike binary operators, the top of the stack,
13069                              * now that this unary one has been popped off, may
13070                              * legally be an operator, and we now have operand
13071                              * for it. */
13072                             top_index--;
13073                             SvREFCNT_dec_NN(top);
13074                             goto handle_operand;
13075
13076                         case '&':
13077                             prev = av_pop(stack);
13078                             _invlist_intersection(prev,
13079                                                    current,
13080                                                    &current);
13081                             av_push(stack, current);
13082                             break;
13083
13084                         case '|':
13085                         case '+':
13086                             prev = av_pop(stack);
13087                             _invlist_union(prev, current, &current);
13088                             av_push(stack, current);
13089                             break;
13090
13091                         case '-':
13092                             prev = av_pop(stack);;
13093                             _invlist_subtract(prev, current, &current);
13094                             av_push(stack, current);
13095                             break;
13096
13097                         case '^':   /* The union minus the intersection */
13098                         {
13099                             SV* i = NULL;
13100                             SV* u = NULL;
13101                             SV* element;
13102
13103                             prev = av_pop(stack);
13104                             _invlist_union(prev, current, &u);
13105                             _invlist_intersection(prev, current, &i);
13106                             /* _invlist_subtract will overwrite current
13107                                 without freeing what it already contains */
13108                             element = current;
13109                             _invlist_subtract(u, i, &current);
13110                             av_push(stack, current);
13111                             SvREFCNT_dec_NN(i);
13112                             SvREFCNT_dec_NN(u);
13113                             SvREFCNT_dec_NN(element);
13114                             break;
13115                         }
13116
13117                         default:
13118                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13119                 }
13120                 SvREFCNT_dec_NN(top);
13121                 SvREFCNT_dec(prev);
13122             }
13123         }
13124
13125         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13126     }
13127
13128     if (av_tindex(stack) < 0   /* Was empty */
13129         || ((final = av_pop(stack)) == NULL)
13130         || ! IS_OPERAND(final)
13131         || av_tindex(stack) >= 0)  /* More left on stack */
13132     {
13133         vFAIL("Incomplete expression within '(?[ ])'");
13134     }
13135
13136     /* Here, 'final' is the resultant inversion list from evaluating the
13137      * expression.  Return it if so requested */
13138     if (return_invlist) {
13139         *return_invlist = final;
13140         return END;
13141     }
13142
13143     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13144      * expecting a string of ranges and individual code points */
13145     invlist_iterinit(final);
13146     result_string = newSVpvs("");
13147     while (invlist_iternext(final, &start, &end)) {
13148         if (start == end) {
13149             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13150         }
13151         else {
13152             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13153                                                      start,          end);
13154         }
13155     }
13156
13157     save_parse = RExC_parse;
13158     RExC_parse = SvPV(result_string, len);
13159     save_end = RExC_end;
13160     RExC_end = RExC_parse + len;
13161
13162     /* We turn off folding around the call, as the class we have constructed
13163      * already has all folding taken into consideration, and we don't want
13164      * regclass() to add to that */
13165     RExC_flags &= ~RXf_PMf_FOLD;
13166     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13167      */
13168     node = regclass(pRExC_state, flagp,depth+1,
13169                     FALSE, /* means parse the whole char class */
13170                     FALSE, /* don't allow multi-char folds */
13171                     TRUE, /* silence non-portable warnings.  The above may very
13172                              well have generated non-portable code points, but
13173                              they're valid on this machine */
13174                     NULL);
13175     if (!node)
13176         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13177                     PTR2UV(flagp));
13178     if (save_fold) {
13179         RExC_flags |= RXf_PMf_FOLD;
13180     }
13181     RExC_parse = save_parse + 1;
13182     RExC_end = save_end;
13183     SvREFCNT_dec_NN(final);
13184     SvREFCNT_dec_NN(result_string);
13185
13186     nextchar(pRExC_state);
13187     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13188     return node;
13189 }
13190 #undef IS_OPERAND
13191
13192 STATIC void
13193 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13194 {
13195     /* Adds to *invlist the above-Latin1 code points that take part in a
13196      * simple fold with the Latin1-range code point 'cp'.  The rules are
13197      * hard-coded here, so that an innocent-looking character class, like
13198      * /[ks]/i won't have to go out to disk to find the possible matches.
13199      *
13200      * Call this only for a Latin1-range 'cp' that is known to be involved in
13201      * a simple fold with code points above Latin1.  The results would be
13202      * wrong if /aa has been specified.  Multi-char folds are not handled
13203      * here; they need special treatment elsewhere.
13204      *
13205      * XXX Generating this table via regen would be better, in case a new
13206      * version of the Unicode standard adds mappings; that isn't very
13207      * likely, and the default: case below may catch it if it happens. */
13208     UV partners[2];         /* what 'cp' folds with; at most two */
13209     unsigned int n_partners = 0;
13210     unsigned int i;
13211
13212     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13213     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13214
13215     switch (cp) {
13216         case 'k': case 'K':
13217             partners[n_partners++] = KELVIN_SIGN;
13218             break;
13219         case 's': case 'S':
13220             partners[n_partners++] = LATIN_SMALL_LETTER_LONG_S;
13221             break;
13222         case MICRO_SIGN:
13223             partners[n_partners++] = GREEK_CAPITAL_LETTER_MU;
13224             partners[n_partners++] = GREEK_SMALL_LETTER_MU;
13225             break;
13226         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13227         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13228             partners[n_partners++] = ANGSTROM_SIGN;
13229             break;
13230         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13231             partners[n_partners++] = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
13232             break;
13233         case LATIN_SMALL_LETTER_SHARP_S:
13234             partners[n_partners++] = LATIN_CAPITAL_LETTER_SHARP_S;
13235             break;
13236         default:
13237             /* Use deprecated warning to increase the chances of this being
13238              * output */
13239             ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13240             break;
13241     }
13242     for (i = 0; i < n_partners; i++) {   /* nothing to add after default: */
13243         *invlist = add_cp_to_invlist(*invlist, partners[i]);
13244     }
13245 }
13246
13247 /* The names of properties whose definitions are not known at compile time are
13248  * stored in this SV, after a constant heading.  So if the length has been
13249  * changed since initialization, then there is a run-time definition.
       *
       * The macro refers to 'listsv' and 'initial_listsv_len' by name, so it can
       * only be used where variables with those names are in scope;
       * S_regclass() below declares both as locals. */
13250 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13251                                         (SvCUR(listsv) != initial_listsv_len)
13252
13253 STATIC regnode *
13254 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13255                  const bool stop_at_1,  /* Just parse the next thing, don't
13256                                            look for a full character class */
13257                  bool allow_multi_folds,
13258                  const bool silence_non_portable,   /* Don't output warnings
13259                                                        about too large
13260                                                        characters */
13261                  SV** ret_invlist)  /* Return an inversion list, not a node */
13262 {
13263     /* parse a bracketed class specification.  Most of these will produce an
13264      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13265      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13266      * under /i with multi-character folds: it will be rewritten following the
13267      * paradigm of this example, where the <multi-fold>s are characters which
13268      * fold to multiple character sequences:
13269      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13270      * gets effectively rewritten as:
13271      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13272      * reg() gets called (recursively) on the rewritten version, and this
13273      * function will return what it constructs.  (Actually the <multi-fold>s
13274      * aren't physically removed from the [abcdefghi], it's just that they are
13275      * ignored in the recursion by means of a flag:
13276      * <RExC_in_multi_char_class>.)
13277      *
13278      * ANYOF nodes contain a bit map for the first 256 characters, with the
13279      * corresponding bit set if that character is in the list.  For characters
13280      * above 255, a range list or swash is used.  There are extra bits for \w,
13281      * etc. in locale ANYOFs, as what these match is not determinable at
13282      * compile time
13283      *
13284      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13285      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13286      */
13287
13288     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13289     IV range = 0;
13290     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13291     regnode *ret;
13292     STRLEN numlen;
13293     IV namedclass = OOB_NAMEDCLASS;
13294     char *rangebegin = NULL;
13295     bool need_class = 0;
13296     SV *listsv = NULL;
13297     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13298                                       than just initialized.  */
13299     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13300     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13301                                extended beyond the Latin1 range.  These have to
13302                                be kept separate from other code points for much
13303                                of this function because their handling  is
13304                                different under /i, and for most classes under
13305                                /d as well */
13306     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13307                                separate for a while from the non-complemented
13308                                versions because of complications with /d
13309                                matching */
13310     UV element_count = 0;   /* Number of distinct elements in the class.
13311                                Optimizations may be possible if this is tiny */
13312     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13313                                        character; used under /i */
13314     UV n;
13315     char * stop_ptr = RExC_end;    /* where to stop parsing */
13316     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13317                                                    space? */
13318     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13319
13320     /* Unicode properties are stored in a swash; this holds the current one
13321      * being parsed.  If this swash is the only above-latin1 component of the
13322      * character class, an optimization is to pass it directly on to the
13323      * execution engine.  Otherwise, it is set to NULL to indicate that there
13324      * are other things in the class that have to be dealt with at execution
13325      * time */
13326     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13327
13328     /* Set if a component of this character class is user-defined; just passed
13329      * on to the engine */
13330     bool has_user_defined_property = FALSE;
13331
13332     /* Inversion list of code points this node matches only when the target
13333      * string is in UTF-8.  (Because it is under /d) */
13334     SV* depends_list = NULL;
13335
13336     /* Inversion list of code points this node matches regardless of things
13337      * like locale, folding, utf8ness of the target string */
13338     SV* cp_list = NULL;
13339
13340     /* Like cp_list, but code points on this list need to be checked for things
13341      * that fold to/from them under /i */
13342     SV* cp_foldable_list = NULL;
13343
13344     /* Like cp_list, but code points on this list are valid only when the
13345      * runtime locale is UTF-8 */
13346     SV* only_utf8_locale_list = NULL;
13347
13348 #ifdef EBCDIC
13349     /* In a range, counts how many 0-2 of the ends of it came from literals,
13350      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13351     UV literal_endpoint = 0;
13352 #endif
13353     bool invert = FALSE;    /* Is this class to be complemented */
13354
13355     bool warn_super = ALWAYS_WARN_SUPER;
13356
13357     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13358         case we need to change the emitted regop to an EXACT. */
13359     const char * orig_parse = RExC_parse;
13360     const SSize_t orig_size = RExC_size;
13361     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13362     GET_RE_DEBUG_FLAGS_DECL;
13363
13364     PERL_ARGS_ASSERT_REGCLASS;
13365 #ifndef DEBUGGING
13366     PERL_UNUSED_ARG(depth);
13367 #endif
13368
13369     DEBUG_PARSE("clas");
13370
13371     /* Assume we are going to generate an ANYOF node. */
13372     ret = reganode(pRExC_state, ANYOF, 0);
13373
13374     if (SIZE_ONLY) {
13375         RExC_size += ANYOF_SKIP;
13376         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13377     }
13378     else {
13379         ANYOF_FLAGS(ret) = 0;
13380
13381         RExC_emit += ANYOF_SKIP;
13382         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13383         initial_listsv_len = SvCUR(listsv);
13384         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13385     }
13386
13387     if (skip_white) {
13388         RExC_parse = regpatws(pRExC_state, RExC_parse,
13389                               FALSE /* means don't recognize comments */ );
13390     }
13391
13392     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13393         RExC_parse++;
13394         invert = TRUE;
13395         allow_multi_folds = FALSE;
13396         RExC_naughty++;
13397         if (skip_white) {
13398             RExC_parse = regpatws(pRExC_state, RExC_parse,
13399                                   FALSE /* means don't recognize comments */ );
13400         }
13401     }
13402
13403     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13404     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13405         const char *s = RExC_parse;
13406         const char  c = *s++;
13407
13408         while (isWORDCHAR(*s))
13409             s++;
13410         if (*s && c == *s && s[1] == ']') {
13411             SAVEFREESV(RExC_rx_sv);
13412             ckWARN3reg(s+2,
13413                        "POSIX syntax [%c %c] belongs inside character classes",
13414                        c, c);
13415             (void)ReREFCNT_inc(RExC_rx_sv);
13416         }
13417     }
13418
13419     /* If the caller wants us to just parse a single element, accomplish this
13420      * by faking the loop ending condition */
13421     if (stop_at_1 && RExC_end > RExC_parse) {
13422         stop_ptr = RExC_parse + 1;
13423     }
13424
13425     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13426     if (UCHARAT(RExC_parse) == ']')
13427         goto charclassloop;
13428
13429 parseit:
13430     while (1) {
13431         if  (RExC_parse >= stop_ptr) {
13432             break;
13433         }
13434
13435         if (skip_white) {
13436             RExC_parse = regpatws(pRExC_state, RExC_parse,
13437                                   FALSE /* means don't recognize comments */ );
13438         }
13439
13440         if  (UCHARAT(RExC_parse) == ']') {
13441             break;
13442         }
13443
13444     charclassloop:
13445
13446         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13447         save_value = value;
13448         save_prevvalue = prevvalue;
13449
13450         if (!range) {
13451             rangebegin = RExC_parse;
13452             element_count++;
13453         }
13454         if (UTF) {
13455             value = utf8n_to_uvchr((U8*)RExC_parse,
13456                                    RExC_end - RExC_parse,
13457                                    &numlen, UTF8_ALLOW_DEFAULT);
13458             RExC_parse += numlen;
13459         }
13460         else
13461             value = UCHARAT(RExC_parse++);
13462
13463         if (value == '['
13464             && RExC_parse < RExC_end
13465             && POSIXCC(UCHARAT(RExC_parse)))
13466         {
13467             namedclass = regpposixcc(pRExC_state, value, strict);
13468         }
13469         else if (value == '\\') {
13470             if (UTF) {
13471                 value = utf8n_to_uvchr((U8*)RExC_parse,
13472                                    RExC_end - RExC_parse,
13473                                    &numlen, UTF8_ALLOW_DEFAULT);
13474                 RExC_parse += numlen;
13475             }
13476             else
13477                 value = UCHARAT(RExC_parse++);
13478
13479             /* Some compilers cannot handle switching on 64-bit integer
13480              * values, therefore value cannot be an UV.  Yes, this will
13481              * be a problem later if we want switch on Unicode.
13482              * A similar issue a little bit later when switching on
13483              * namedclass. --jhi */
13484
13485             /* If the \ is escaping white space when white space is being
13486              * skipped, it means that that white space is wanted literally, and
13487              * is already in 'value'.  Otherwise, need to translate the escape
13488              * into what it signifies. */
13489             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13490
13491             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13492             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13493             case 's':   namedclass = ANYOF_SPACE;       break;
13494             case 'S':   namedclass = ANYOF_NSPACE;      break;
13495             case 'd':   namedclass = ANYOF_DIGIT;       break;
13496             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13497             case 'v':   namedclass = ANYOF_VERTWS;      break;
13498             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13499             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13500             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13501             case 'N':  /* Handle \N{NAME} in class */
13502                 {
13503                     /* We only pay attention to the first char of
13504                     multichar strings being returned. I kinda wonder
13505                     if this makes sense as it does change the behaviour
13506                     from earlier versions, OTOH that behaviour was broken
13507                     as well. */
13508                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13509                                       TRUE, /* => charclass */
13510                                       strict))
13511                     {
13512                         if (*flagp & RESTART_UTF8)
13513                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13514                         goto parseit;
13515                     }
13516                 }
13517                 break;
13518             case 'p':
13519             case 'P':
13520                 {
13521                 char *e;
13522
13523                 /* We will handle any undefined properties ourselves */
13524                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13525                                        /* And we actually would prefer to get
13526                                         * the straight inversion list of the
13527                                         * swash, since we will be accessing it
13528                                         * anyway, to save a little time */
13529                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13530
13531                 if (RExC_parse >= RExC_end)
13532                     vFAIL2("Empty \\%c{}", (U8)value);
13533                 if (*RExC_parse == '{') {
13534                     const U8 c = (U8)value;
13535                     e = strchr(RExC_parse++, '}');
13536                     if (!e)
13537                         vFAIL2("Missing right brace on \\%c{}", c);
13538                     while (isSPACE(*RExC_parse))
13539                         RExC_parse++;
13540                     if (e == RExC_parse)
13541                         vFAIL2("Empty \\%c{}", c);
13542                     n = e - RExC_parse;
13543                     while (isSPACE(*(RExC_parse + n - 1)))
13544                         n--;
13545                 }
13546                 else {
13547                     e = RExC_parse;
13548                     n = 1;
13549                 }
13550                 if (!SIZE_ONLY) {
13551                     SV* invlist;
13552                     char* name;
13553
13554                     if (UCHARAT(RExC_parse) == '^') {
13555                          RExC_parse++;
13556                          n--;
13557                          /* toggle.  (The rhs xor gets the single bit that
13558                           * differs between P and p; the other xor inverts just
13559                           * that bit) */
13560                          value ^= 'P' ^ 'p';
13561
13562                          while (isSPACE(*RExC_parse)) {
13563                               RExC_parse++;
13564                               n--;
13565                          }
13566                     }
13567                     /* Try to get the definition of the property into
13568                      * <invlist>.  If /i is in effect, the effective property
13569                      * will have its name be <__NAME_i>.  The design is
13570                      * discussed in commit
13571                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13572                     name = savepv(Perl_form(aTHX_
13573                                           "%s%.*s%s\n",
13574                                           (FOLD) ? "__" : "",
13575                                           (int)n,
13576                                           RExC_parse,
13577                                           (FOLD) ? "_i" : ""
13578                                 ));
13579
13580                     /* Look up the property name, and get its swash and
13581                      * inversion list, if the property is found  */
13582                     if (swash) {
13583                         SvREFCNT_dec_NN(swash);
13584                     }
13585                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13586                                              1, /* binary */
13587                                              0, /* not tr/// */
13588                                              NULL, /* No inversion list */
13589                                              &swash_init_flags
13590                                             );
13591                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13592                         HV* curpkg = (IN_PERL_COMPILETIME)
13593                                       ? PL_curstash
13594                                       : CopSTASH(PL_curcop);
13595                         if (swash) {
13596                             SvREFCNT_dec_NN(swash);
13597                             swash = NULL;
13598                         }
13599
13600                         /* Here didn't find it.  It could be a user-defined
13601                          * property that will be available at run-time.  If we
13602                          * accept only compile-time properties, it is an error;
13603                          * otherwise add it to the list for run-time look up */
13604                         if (ret_invlist) {
13605                             RExC_parse = e + 1;
13606                             vFAIL2utf8f(
13607                                 "Property '%"UTF8f"' is unknown",
13608                                 UTF8fARG(UTF, n, name));
13609                         }
13610
13611                         /* If the property name doesn't already have a package
13612                          * name, add the current one to it so that it can be
13613                          * referred to outside it. [perl #121777] */
13614                         if (curpkg && ! instr(name, "::")) {
13615                             char* pkgname = HvNAME(curpkg);
13616                             if (strNE(pkgname, "main")) {
13617                                 char* full_name = Perl_form(aTHX_
13618                                                             "%s::%s",
13619                                                             pkgname,
13620                                                             name);
13621                                 n = strlen(full_name);
13622                                 Safefree(name);
13623                                 name = savepvn(full_name, n);
13624                             }
13625                         }
13626                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13627                                         (value == 'p' ? '+' : '!'),
13628                                         UTF8fARG(UTF, n, name));
13629                         has_user_defined_property = TRUE;
13630
13631                         /* We don't know yet, so have to assume that the
13632                          * property could match something in the Latin1 range,
13633                          * hence something that isn't utf8.  Note that this
13634                          * would cause things in <depends_list> to match
13635                          * inappropriately, except that any \p{}, including
13636                          * this one forces Unicode semantics, which means there
13637                          * is no <depends_list> */
13638                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13639                     }
13640                     else {
13641
13642                         /* Here, did get the swash and its inversion list.  If
13643                          * the swash is from a user-defined property, then this
13644                          * whole character class should be regarded as such */
13645                         if (swash_init_flags
13646                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13647                         {
13648                             has_user_defined_property = TRUE;
13649                         }
13650                         else if
13651                             /* We warn on matching an above-Unicode code point
13652                              * if the match would return true, except don't
13653                              * warn for \p{All}, which has exactly one element
13654                              * = 0 */
13655                             (_invlist_contains_cp(invlist, 0x110000)
13656                                 && (! (_invlist_len(invlist) == 1
13657                                        && *invlist_array(invlist) == 0)))
13658                         {
13659                             warn_super = TRUE;
13660                         }
13661
13662
13663                         /* Invert if asking for the complement */
13664                         if (value == 'P') {
13665                             _invlist_union_complement_2nd(properties,
13666                                                           invlist,
13667                                                           &properties);
13668
13669                             /* The swash can't be used as-is, because we've
13670                              * inverted things; delay removing it to here after
13671                              * we have copied its invlist above */
13672                             SvREFCNT_dec_NN(swash);
13673                             swash = NULL;
13674                         }
13675                         else {
13676                             _invlist_union(properties, invlist, &properties);
13677                         }
13678                     }
13679                     Safefree(name);
13680                 }
13681                 RExC_parse = e + 1;
13682                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13683                                                 named */
13684
13685                 /* \p means they want Unicode semantics */
13686                 RExC_uni_semantics = 1;
13687                 }
13688                 break;
13689             case 'n':   value = '\n';                   break;
13690             case 'r':   value = '\r';                   break;
13691             case 't':   value = '\t';                   break;
13692             case 'f':   value = '\f';                   break;
13693             case 'b':   value = '\b';                   break;
13694             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13695             case 'a':   value = '\a';                   break;
13696             case 'o':
13697                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13698                 {
13699                     const char* error_msg;
13700                     bool valid = grok_bslash_o(&RExC_parse,
13701                                                &value,
13702                                                &error_msg,
13703                                                SIZE_ONLY,   /* warnings in pass
13704                                                                1 only */
13705                                                strict,
13706                                                silence_non_portable,
13707                                                UTF);
13708                     if (! valid) {
13709                         vFAIL(error_msg);
13710                     }
13711                 }
13712                 if (PL_encoding && value < 0x100) {
13713                     goto recode_encoding;
13714                 }
13715                 break;
13716             case 'x':
13717                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13718                 {
13719                     const char* error_msg;
13720                     bool valid = grok_bslash_x(&RExC_parse,
13721                                                &value,
13722                                                &error_msg,
13723                                                TRUE, /* Output warnings */
13724                                                strict,
13725                                                silence_non_portable,
13726                                                UTF);
13727                     if (! valid) {
13728                         vFAIL(error_msg);
13729                     }
13730                 }
13731                 if (PL_encoding && value < 0x100)
13732                     goto recode_encoding;
13733                 break;
13734             case 'c':
13735                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13736                 break;
13737             case '0': case '1': case '2': case '3': case '4':
13738             case '5': case '6': case '7':
13739                 {
13740                     /* Take 1-3 octal digits */
13741                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13742                     numlen = (strict) ? 4 : 3;
13743                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13744                     RExC_parse += numlen;
13745                     if (numlen != 3) {
13746                         if (strict) {
13747                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13748                             vFAIL("Need exactly 3 octal digits");
13749                         }
13750                         else if (! SIZE_ONLY /* like \08, \178 */
13751                                  && numlen < 3
13752                                  && RExC_parse < RExC_end
13753                                  && isDIGIT(*RExC_parse)
13754                                  && ckWARN(WARN_REGEXP))
13755                         {
13756                             SAVEFREESV(RExC_rx_sv);
13757                             reg_warn_non_literal_string(
13758                                  RExC_parse + 1,
13759                                  form_short_octal_warning(RExC_parse, numlen));
13760                             (void)ReREFCNT_inc(RExC_rx_sv);
13761                         }
13762                     }
13763                     if (PL_encoding && value < 0x100)
13764                         goto recode_encoding;
13765                     break;
13766                 }
13767             recode_encoding:
13768                 if (! RExC_override_recoding) {
13769                     SV* enc = PL_encoding;
13770                     value = reg_recode((const char)(U8)value, &enc);
13771                     if (!enc) {
13772                         if (strict) {
13773                             vFAIL("Invalid escape in the specified encoding");
13774                         }
13775                         else if (SIZE_ONLY) {
13776                             ckWARNreg(RExC_parse,
13777                                   "Invalid escape in the specified encoding");
13778                         }
13779                     }
13780                     break;
13781                 }
13782             default:
13783                 /* Allow \_ to not give an error */
13784                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13785                     if (strict) {
13786                         vFAIL2("Unrecognized escape \\%c in character class",
13787                                (int)value);
13788                     }
13789                     else {
13790                         SAVEFREESV(RExC_rx_sv);
13791                         ckWARN2reg(RExC_parse,
13792                             "Unrecognized escape \\%c in character class passed through",
13793                             (int)value);
13794                         (void)ReREFCNT_inc(RExC_rx_sv);
13795                     }
13796                 }
13797                 break;
13798             }   /* End of switch on char following backslash */
13799         } /* end of handling backslash escape sequences */
13800 #ifdef EBCDIC
13801         else
13802             literal_endpoint++;
13803 #endif
13804
13805         /* Here, we have the current token in 'value' */
13806
13807         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13808             U8 classnum;
13809
13810             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13811              * literal, as is the character that began the false range, i.e.
13812              * the 'a' in the examples */
13813             if (range) {
13814                 if (!SIZE_ONLY) {
13815                     const int w = (RExC_parse >= rangebegin)
13816                                   ? RExC_parse - rangebegin
13817                                   : 0;
13818                     if (strict) {
13819                         vFAIL2utf8f(
13820                             "False [] range \"%"UTF8f"\"",
13821                             UTF8fARG(UTF, w, rangebegin));
13822                     }
13823                     else {
13824                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13825                         ckWARN2reg(RExC_parse,
13826                             "False [] range \"%"UTF8f"\"",
13827                             UTF8fARG(UTF, w, rangebegin));
13828                         (void)ReREFCNT_inc(RExC_rx_sv);
13829                         cp_list = add_cp_to_invlist(cp_list, '-');
13830                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13831                                                              prevvalue);
13832                     }
13833                 }
13834
13835                 range = 0; /* this was not a true range */
13836                 element_count += 2; /* So counts for three values */
13837             }
13838
13839             classnum = namedclass_to_classnum(namedclass);
13840
13841             if (LOC && namedclass < ANYOF_POSIXL_MAX
13842 #ifndef HAS_ISASCII
13843                 && classnum != _CC_ASCII
13844 #endif
13845             ) {
13846                 /* What the Posix classes (like \w, [:space:]) match in locale
13847                  * isn't knowable under locale until actual match time.  Room
13848                  * must be reserved (one time per outer bracketed class) to
13849                  * store such classes.  The space will contain a bit for each
13850                  * named class that is to be matched against.  This isn't
13851                  * needed for \p{} and pseudo-classes, as they are not affected
13852                  * by locale, and hence are dealt with separately */
13853                 if (! need_class) {
13854                     need_class = 1;
13855                     if (SIZE_ONLY) {
13856                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13857                     }
13858                     else {
13859                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13860                     }
13861                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13862                     ANYOF_POSIXL_ZERO(ret);
13863                 }
13864
13865                 /* Coverity thinks it is possible for this to be negative; both
13866                  * jhi and khw think it's not, but be safer */
13867                 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13868                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13869
13870                 /* See if it already matches the complement of this POSIX
13871                  * class */
13872                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13873                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13874                                                             ? -1
13875                                                             : 1)))
13876                 {
13877                     posixl_matches_all = TRUE;
13878                     break;  /* No need to continue.  Since it matches both
13879                                e.g., \w and \W, it matches everything, and the
13880                                bracketed class can be optimized into qr/./s */
13881                 }
13882
13883                 /* Add this class to those that should be checked at runtime */
13884                 ANYOF_POSIXL_SET(ret, namedclass);
13885
13886                 /* The above-Latin1 characters are not subject to locale rules.
13887                  * Just add them, in the second pass, to the
13888                  * unconditionally-matched list */
13889                 if (! SIZE_ONLY) {
13890                     SV* scratch_list = NULL;
13891
13892                     /* Get the list of the above-Latin1 code points this
13893                      * matches */
13894                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13895                                           PL_XPosix_ptrs[classnum],
13896
13897                                           /* Odd numbers are complements, like
13898                                            * NDIGIT, NASCII, ... */
13899                                           namedclass % 2 != 0,
13900                                           &scratch_list);
13901                     /* Checking if 'cp_list' is NULL first saves an extra
13902                      * clone.  Its reference count will be decremented at the
13903                      * next union, etc, or if this is the only instance, at the
13904                      * end of the routine */
13905                     if (! cp_list) {
13906                         cp_list = scratch_list;
13907                     }
13908                     else {
13909                         _invlist_union(cp_list, scratch_list, &cp_list);
13910                         SvREFCNT_dec_NN(scratch_list);
13911                     }
13912                     continue;   /* Go get next character */
13913                 }
13914             }
13915             else if (! SIZE_ONLY) {
13916
13917                 /* Here, not in pass1 (in that pass we skip calculating the
13918                  * contents of this class), and either is not /l, or is a POSIX
13919                  * class for which /l doesn't matter (or is a Unicode property,
13920                  * which is skipped here). */
13921                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13922                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13923
13924                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13925                          * nor /l make a difference in what these match,
13926                          * therefore we just add what they match to cp_list. */
13927                         if (classnum != _CC_VERTSPACE) {
13928                             assert(   namedclass == ANYOF_HORIZWS
13929                                    || namedclass == ANYOF_NHORIZWS);
13930
13931                             /* It turns out that \h is just a synonym for
13932                              * XPosixBlank */
13933                             classnum = _CC_BLANK;
13934                         }
13935
13936                         _invlist_union_maybe_complement_2nd(
13937                                 cp_list,
13938                                 PL_XPosix_ptrs[classnum],
13939                                 namedclass % 2 != 0,    /* Complement if odd
13940                                                           (NHORIZWS, NVERTWS)
13941                                                         */
13942                                 &cp_list);
13943                     }
13944                 }
13945                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13946                            complement and use nposixes */
13947                     SV** posixes_ptr = namedclass % 2 == 0
13948                                        ? &posixes
13949                                        : &nposixes;
13950                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13951                     _invlist_union_maybe_complement_2nd(
13952                                                      *posixes_ptr,
13953                                                      *source_ptr,
13954                                                      namedclass % 2 != 0,
13955                                                      posixes_ptr);
13956                 }
13957                 continue;   /* Go get next character */
13958             }
13959         } /* end of namedclass \blah */
13960
13961         /* Here, we have a single value.  If 'range' is set, it is the ending
13962          * of a range--check its validity.  Later, we will handle each
13963          * individual code point in the range.  If 'range' isn't set, this
13964          * could be the beginning of a range, so check for that by looking
13965          * ahead to see if the next real character to be processed is the range
13966          * indicator--the minus sign */
13967
13968         if (skip_white) {
13969             RExC_parse = regpatws(pRExC_state, RExC_parse,
13970                                 FALSE /* means don't recognize comments */ );
13971         }
13972
13973         if (range) {
13974             if (prevvalue > value) /* b-a */ {
13975                 const int w = RExC_parse - rangebegin;
13976                 vFAIL2utf8f(
13977                     "Invalid [] range \"%"UTF8f"\"",
13978                     UTF8fARG(UTF, w, rangebegin));
13979                 range = 0; /* not a valid range */
13980             }
13981         }
13982         else {
13983             prevvalue = value; /* save the beginning of the potential range */
13984             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13985                 && *RExC_parse == '-')
13986             {
13987                 char* next_char_ptr = RExC_parse + 1;
13988                 if (skip_white) {   /* Get the next real char after the '-' */
13989                     next_char_ptr = regpatws(pRExC_state,
13990                                              RExC_parse + 1,
13991                                              FALSE); /* means don't recognize
13992                                                         comments */
13993                 }
13994
13995                 /* If the '-' is at the end of the class (just before the ']'),
13996                  * it is a literal minus; otherwise it is a range */
13997                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13998                     RExC_parse = next_char_ptr;
13999
14000                     /* a bad range like \w-, [:word:]- ? */
14001                     if (namedclass > OOB_NAMEDCLASS) {
14002                         if (strict || ckWARN(WARN_REGEXP)) {
14003                             const int w =
14004                                 RExC_parse >= rangebegin ?
14005                                 RExC_parse - rangebegin : 0;
14006                             if (strict) {
14007                                 vFAIL4("False [] range \"%*.*s\"",
14008                                     w, w, rangebegin);
14009                             }
14010                             else {
14011                                 vWARN4(RExC_parse,
14012                                     "False [] range \"%*.*s\"",
14013                                     w, w, rangebegin);
14014                             }
14015                         }
14016                         if (!SIZE_ONLY) {
14017                             cp_list = add_cp_to_invlist(cp_list, '-');
14018                         }
14019                         element_count++;
14020                     } else
14021                         range = 1;      /* yeah, it's a range! */
14022                     continue;   /* but do it the next time */
14023                 }
14024             }
14025         }
14026
14027         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14028          * if not */
14029
14030         /* non-Latin1 code point implies unicode semantics.  Must be set in
14031          * pass1 so is there for the whole of pass 2 */
14032         if (value > 255) {
14033             RExC_uni_semantics = 1;
14034         }
14035
14036         /* Ready to process either the single value, or the completed range.
14037          * For single-valued non-inverted ranges, we consider the possibility
14038          * of multi-char folds.  (We made a conscious decision to not do this
14039          * for the other cases because it can often lead to non-intuitive
14040          * results.  For example, you have the peculiar case that:
14041          *  "s s" =~ /^[^\xDF]+$/i => Y
14042          *  "ss"  =~ /^[^\xDF]+$/i => N
14043          *
14044          * See [perl #89750]) */
14045         if (FOLD && allow_multi_folds && value == prevvalue) {
14046             if (value == LATIN_SMALL_LETTER_SHARP_S
14047                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14048                                                         value)))
14049             {
14050                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14051
14052                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14053                 STRLEN foldlen;
14054
14055                 UV folded = _to_uni_fold_flags(
14056                                 value,
14057                                 foldbuf,
14058                                 &foldlen,
14059                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14060                                                    ? FOLD_FLAGS_NOMIX_ASCII
14061                                                    : 0)
14062                                 );
14063
14064                 /* Here, <folded> should be the first character of the
14065                  * multi-char fold of <value>, with <foldbuf> containing the
14066                  * whole thing.  But, if this fold is not allowed (because of
14067                  * the flags), <folded> will be the same as <value>, and should
14068                  * be processed like any other character, so skip the special
14069                  * handling */
14070                 if (folded != value) {
14071
14072                     /* Skip if we are recursed, currently parsing the class
14073                      * again.  Otherwise add this character to the list of
14074                      * multi-char folds. */
14075                     if (! RExC_in_multi_char_class) {
14076                         AV** this_array_ptr;
14077                         AV* this_array;
14078                         STRLEN cp_count = utf8_length(foldbuf,
14079                                                       foldbuf + foldlen);
14080                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14081
14082                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14083
14084
14085                         if (! multi_char_matches) {
14086                             multi_char_matches = newAV();
14087                         }
14088
14089                         /* <multi_char_matches> is actually an array of arrays.
14090                          * There will be one or two top-level elements: [2],
14091                          * and/or [3].  The [2] element is an array, each
14092                          * element thereof is a character which folds to TWO
14093                          * characters; [3] is for folds to THREE characters.
14094                          * (Unicode guarantees a maximum of 3 characters in any
14095                          * fold.)  When we rewrite the character class below,
14096                          * we will do so such that the longest folds are
14097                          * written first, so that it prefers the longest
14098                          * matching strings first.  This is done even if it
14099                          * turns out that any quantifier is non-greedy, out of
14100                          * programmer laziness.  Tom Christiansen has agreed
14101                          * that this is ok.  This makes the test for the
14102                          * ligature 'ffi' come before the test for 'ff' */
14103                         if (av_exists(multi_char_matches, cp_count)) {
14104                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14105                                                              cp_count, FALSE);
14106                             this_array = *this_array_ptr;
14107                         }
14108                         else {
14109                             this_array = newAV();
14110                             av_store(multi_char_matches, cp_count,
14111                                      (SV*) this_array);
14112                         }
14113                         av_push(this_array, multi_fold);
14114                     }
14115
14116                     /* This element should not be processed further in this
14117                      * class */
14118                     element_count--;
14119                     value = save_value;
14120                     prevvalue = save_prevvalue;
14121                     continue;
14122                 }
14123             }
14124         }
14125
14126         /* Deal with this element of the class */
14127         if (! SIZE_ONLY) {
14128 #ifndef EBCDIC
14129             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14130                                                      prevvalue, value);
14131 #else
14132             SV* this_range = _new_invlist(1);
14133             _append_range_to_invlist(this_range, prevvalue, value);
14134
14135             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14136              * If this range was specified using something like 'i-j', we want
14137              * to include only the 'i' and the 'j', and not anything in
14138              * between, so exclude non-ASCII, non-alphabetics from it.
14139              * However, if the range was specified with something like
14140              * [\x89-\x91] or [\x89-j], all code points within it should be
14141              * included.  literal_endpoint==2 means both ends of the range used
14142              * a literal character, not \x{foo} */
14143             if (literal_endpoint == 2
14144                 && ((prevvalue >= 'a' && value <= 'z')
14145                     || (prevvalue >= 'A' && value <= 'Z')))
14146             {
14147                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14148                                       &this_range);
14149
14150                 /* Since this above only contains ascii, the intersection of it
14151                  * with anything will still yield only ascii */
14152                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14153                                       &this_range);
14154             }
14155             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14156             literal_endpoint = 0;
14157 #endif
14158         }
14159
14160         range = 0; /* this range (if it was one) is done now */
14161     } /* End of loop through all the text within the brackets */
14162
14163     /* If anything in the class expands to more than one character, we have to
14164      * deal with them by building up a substitute parse string, and recursively
14165      * calling reg() on it, instead of proceeding */
14166     if (multi_char_matches) {
14167         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14168         I32 cp_count;
14169         STRLEN len;
14170         char *save_end = RExC_end;
14171         char *save_parse = RExC_parse;
14172         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14173                                        a "|" */
14174         I32 reg_flags;
14175
14176         assert(! invert);
14177 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14178            because too confusing */
14179         if (invert) {
14180             sv_catpv(substitute_parse, "(?:");
14181         }
14182 #endif
14183
14184         /* Look at the longest folds first */
14185         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14186
14187             if (av_exists(multi_char_matches, cp_count)) {
14188                 AV** this_array_ptr;
14189                 SV* this_sequence;
14190
14191                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14192                                                  cp_count, FALSE);
14193                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14194                                                                 &PL_sv_undef)
14195                 {
14196                     if (! first_time) {
14197                         sv_catpv(substitute_parse, "|");
14198                     }
14199                     first_time = FALSE;
14200
14201                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14202                 }
14203             }
14204         }
14205
14206         /* If the character class contains anything else besides these
14207          * multi-character folds, have to include it in recursive parsing */
14208         if (element_count) {
14209             sv_catpv(substitute_parse, "|[");
14210             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14211             sv_catpv(substitute_parse, "]");
14212         }
14213
14214         sv_catpv(substitute_parse, ")");
14215 #if 0
14216         if (invert) {
14217             /* This is a way to get the parse to skip forward a whole named
14218              * sequence instead of matching the 2nd character when it fails the
14219              * first */
14220             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14221         }
14222 #endif
14223
14224         RExC_parse = SvPV(substitute_parse, len);
14225         RExC_end = RExC_parse + len;
14226         RExC_in_multi_char_class = 1;
14227         RExC_emit = (regnode *)orig_emit;
14228
14229         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14230
14231         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14232
14233         RExC_parse = save_parse;
14234         RExC_end = save_end;
14235         RExC_in_multi_char_class = 0;
14236         SvREFCNT_dec_NN(multi_char_matches);
14237         return ret;
14238     }
14239
14240     /* Here, we've gone through the entire class and dealt with multi-char
14241      * folds.  We are now in a position that we can do some checks to see if we
14242      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14243      * Currently we only do two checks:
14244      * 1) in the unlikely event that the user has specified both, e.g. \w and
14245      *    \W under /l, then the class matches everything.  (This optimization
14246      *    is done only to make the optimizer code run later work.)
14247      * 2) if the character class contains only a single element (including a
14248      *    single range), we see if there is an equivalent node for it.
14249      * Other checks are possible */
14250     if (! ret_invlist   /* Can't optimize if returning the constructed
14251                            inversion list */
14252         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14253     {
14254         U8 op = END;
14255         U8 arg = 0;
14256
14257         if (UNLIKELY(posixl_matches_all)) {
14258             op = SANY;
14259         }
14260         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14261                                                    \w or [:digit:] or \p{foo}
14262                                                  */
14263
14264             /* All named classes are mapped into POSIXish nodes, with its FLAG
14265              * argument giving which class it is */
14266             switch ((I32)namedclass) {
14267                 case ANYOF_UNIPROP:
14268                     break;
14269
14270                 /* These don't depend on the charset modifiers.  They always
14271                  * match under /u rules */
14272                 case ANYOF_NHORIZWS:
14273                 case ANYOF_HORIZWS:
14274                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14275                     /* FALLTHROUGH */
14276
14277                 case ANYOF_NVERTWS:
14278                 case ANYOF_VERTWS:
14279                     op = POSIXU;
14280                     goto join_posix;
14281
14282                 /* The actual POSIXish node for all the rest depends on the
14283                  * charset modifier.  The ones in the first set depend only on
14284                  * ASCII or, if available on this platform, locale */
14285                 case ANYOF_ASCII:
14286                 case ANYOF_NASCII:
14287 #ifdef HAS_ISASCII
14288                     op = (LOC) ? POSIXL : POSIXA;
14289 #else
14290                     op = POSIXA;
14291 #endif
14292                     goto join_posix;
14293
14294                 case ANYOF_NCASED:
14295                 case ANYOF_LOWER:
14296                 case ANYOF_NLOWER:
14297                 case ANYOF_UPPER:
14298                 case ANYOF_NUPPER:
14299                     /* under /a could be alpha */
14300                     if (FOLD) {
14301                         if (ASCII_RESTRICTED) {
14302                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14303                         }
14304                         else if (! LOC) {
14305                             break;
14306                         }
14307                     }
14308                     /* FALLTHROUGH */
14309
14310                 /* The rest have more possibilities depending on the charset.
14311                  * We take advantage of the enum ordering of the charset
14312                  * modifiers to get the exact node type. */
14313                 default:
14314                     op = POSIXD + get_regex_charset(RExC_flags);
14315                     if (op > POSIXA) { /* /aa is same as /a */
14316                         op = POSIXA;
14317                     }
14318
14319                 join_posix:
14320                     /* The odd numbered ones are the complements of the
14321                      * next-lower even number one */
14322                     if (namedclass % 2 == 1) {
14323                         invert = ! invert;
14324                         namedclass--;
14325                     }
14326                     arg = namedclass_to_classnum(namedclass);
14327                     break;
14328             }
14329         }
14330         else if (value == prevvalue) {
14331
14332             /* Here, the class consists of just a single code point */
14333
14334             if (invert) {
14335                 if (! LOC && value == '\n') {
14336                     op = REG_ANY; /* Optimize [^\n] */
14337                     *flagp |= HASWIDTH|SIMPLE;
14338                     RExC_naughty++;
14339                 }
14340             }
14341             else if (value < 256 || UTF) {
14342
14343                 /* Optimize a single value into an EXACTish node, but not if it
14344                  * would require converting the pattern to UTF-8. */
14345                 op = compute_EXACTish(pRExC_state);
14346             }
14347         } /* Otherwise is a range */
14348         else if (! LOC) {   /* locale could vary these */
14349             if (prevvalue == '0') {
14350                 if (value == '9') {
14351                     arg = _CC_DIGIT;
14352                     op = POSIXA;
14353                 }
14354             }
14355             else if (prevvalue == 'A') {
14356                 if (value == 'Z'
14357 #ifdef EBCDIC
14358                     && literal_endpoint == 2
14359 #endif
14360                 ) {
14361                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14362                     op = POSIXA;
14363                 }
14364             }
14365             else if (prevvalue == 'a') {
14366                 if (value == 'z'
14367 #ifdef EBCDIC
14368                     && literal_endpoint == 2
14369 #endif
14370                 ) {
14371                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14372                     op = POSIXA;
14373                 }
14374             }
14375         }
14376
14377         /* Here, we have changed <op> away from its initial value iff we found
14378          * an optimization */
14379         if (op != END) {
14380
14381             /* Throw away this ANYOF regnode, and emit the calculated one,
14382              * which should correspond to the beginning, not current, state of
14383              * the parse */
14384             const char * cur_parse = RExC_parse;
14385             RExC_parse = (char *)orig_parse;
14386             if ( SIZE_ONLY) {
14387                 if (! LOC) {
14388
14389                     /* To get locale nodes to not use the full ANYOF size would
14390                      * require moving the code above that writes the portions
14391                      * of it that aren't in other nodes to after this point.
14392                      * e.g.  ANYOF_POSIXL_SET */
14393                     RExC_size = orig_size;
14394                 }
14395             }
14396             else {
14397                 RExC_emit = (regnode *)orig_emit;
14398                 if (PL_regkind[op] == POSIXD) {
14399                     if (op == POSIXL) {
14400                         RExC_contains_locale = 1;
14401                     }
14402                     if (invert) {
14403                         op += NPOSIXD - POSIXD;
14404                     }
14405                 }
14406             }
14407
14408             ret = reg_node(pRExC_state, op);
14409
14410             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14411                 if (! SIZE_ONLY) {
14412                     FLAGS(ret) = arg;
14413                 }
14414                 *flagp |= HASWIDTH|SIMPLE;
14415             }
14416             else if (PL_regkind[op] == EXACT) {
14417                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14418                                            TRUE /* downgradable to EXACT */
14419                                            );
14420             }
14421
14422             RExC_parse = (char *) cur_parse;
14423
14424             SvREFCNT_dec(posixes);
14425             SvREFCNT_dec(nposixes);
14426             SvREFCNT_dec(cp_list);
14427             SvREFCNT_dec(cp_foldable_list);
14428             return ret;
14429         }
14430     }
14431
14432     if (SIZE_ONLY)
14433         return ret;
14434     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14435
14436     /* If folding, we calculate all characters that could fold to or from the
14437      * ones already on the list */
14438     if (cp_foldable_list) {
14439         if (FOLD) {
14440             UV start, end;      /* End points of code point ranges */
14441
14442             SV* fold_intersection = NULL;
14443             SV** use_list;
14444
14445             /* Our calculated list will be for Unicode rules.  For locale
14446              * matching, we have to keep a separate list that is consulted at
14447              * runtime only when the locale indicates Unicode rules.  For
14448              * non-locale, we just use the general list */
14449             if (LOC) {
14450                 use_list = &only_utf8_locale_list;
14451             }
14452             else {
14453                 use_list = &cp_list;
14454             }
14455
14456             /* Only the characters in this class that participate in folds need
14457              * be checked.  Get the intersection of this class and all the
14458              * possible characters that are foldable.  This can quickly narrow
14459              * down a large class */
14460             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14461                                   &fold_intersection);
14462
14463             /* The folds for all the Latin1 characters are hard-coded into this
14464              * program, but we have to go out to disk to get the others. */
14465             if (invlist_highest(cp_foldable_list) >= 256) {
14466
14467                 /* This is a hash that for a particular fold gives all
14468                  * characters that are involved in it */
14469                 if (! PL_utf8_foldclosures) {
14470                     _load_PL_utf8_foldclosures();
14471                 }
14472             }
14473
14474             /* Now look at the foldable characters in this class individually */
14475             invlist_iterinit(fold_intersection);
14476             while (invlist_iternext(fold_intersection, &start, &end)) {
14477                 UV j;
14478
14479                 /* Look at every character in the range */
14480                 for (j = start; j <= end; j++) {
14481                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14482                     STRLEN foldlen;
14483                     SV** listp;
14484
14485                     if (j < 256) {
14486
14487                         if (IS_IN_SOME_FOLD_L1(j)) {
14488
14489                             /* ASCII is always matched; non-ASCII is matched
14490                              * only under Unicode rules (which could happen
14491                              * under /l if the locale is a UTF-8 one) */
14492                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14493                                 *use_list = add_cp_to_invlist(*use_list,
14494                                                             PL_fold_latin1[j]);
14495                             }
14496                             else {
14497                                 depends_list =
14498                                  add_cp_to_invlist(depends_list,
14499                                                    PL_fold_latin1[j]);
14500                             }
14501                         }
14502
14503                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14504                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14505                         {
14506                             add_above_Latin1_folds(pRExC_state,
14507                                                    (U8) j,
14508                                                    use_list);
14509                         }
14510                         continue;
14511                     }
14512
14513                     /* Here is an above Latin1 character.  We don't have the
14514                      * rules hard-coded for it.  First, get its fold.  This is
14515                      * the simple fold, as the multi-character folds have been
14516                      * handled earlier and separated out */
14517                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14518                                                         (ASCII_FOLD_RESTRICTED)
14519                                                         ? FOLD_FLAGS_NOMIX_ASCII
14520                                                         : 0);
14521
14522                     /* Single character fold of above Latin1.  Add everything in
14523                     * its fold closure to the list that this node should match.
14524                     * The fold closures data structure is a hash with the keys
14525                     * being the UTF-8 of every character that is folded to, like
14526                     * 'k', and the values each an array of all code points that
14527                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14528                     * Multi-character folds are not included */
14529                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14530                                         (char *) foldbuf, foldlen, FALSE)))
14531                     {
14532                         AV* list = (AV*) *listp;
14533                         IV k;
14534                         for (k = 0; k <= av_tindex(list); k++) {
14535                             SV** c_p = av_fetch(list, k, FALSE);
14536                             UV c;
14537                             assert(c_p);
14538
14539                             c = SvUV(*c_p);
14540
14541                             /* /aa doesn't allow folds between ASCII and non- */
14542                             if ((ASCII_FOLD_RESTRICTED
14543                                 && (isASCII(c) != isASCII(j))))
14544                             {
14545                                 continue;
14546                             }
14547
14548                             /* Folds under /l which cross the 255/256 boundary
14549                              * are added to a separate list.  (These are valid
14550                              * only when the locale is UTF-8.) */
14551                             if (c < 256 && LOC) {
14552                                 *use_list = add_cp_to_invlist(*use_list, c);
14553                                 continue;
14554                             }
14555
14556                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14557                             {
14558                                 cp_list = add_cp_to_invlist(cp_list, c);
14559                             }
14560                             else {
14561                                 /* Similarly folds involving non-ascii Latin1
14562                                 * characters under /d are added to their list */
14563                                 depends_list = add_cp_to_invlist(depends_list,
14564                                                                  c);
14565                             }
14566                         }
14567                     }
14568                 }
14569             }
14570             SvREFCNT_dec_NN(fold_intersection);
14571         }
14572
14573         /* Now that we have finished adding all the folds, there is no reason
14574          * to keep the foldable list separate */
14575         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14576         SvREFCNT_dec_NN(cp_foldable_list);
14577     }
14578
14579     /* And combine the result (if any) with any inversion list from posix
14580      * classes.  The lists are kept separate up to now because we don't want to
14581      * fold the classes (folding of those is automatically handled by the swash
14582      * fetching code) */
14583     if (posixes || nposixes) {
14584         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14585             /* Under /a and /aa, nothing above ASCII matches these */
14586             _invlist_intersection(posixes,
14587                                   PL_XPosix_ptrs[_CC_ASCII],
14588                                   &posixes);
14589         }
14590         if (nposixes) {
14591             if (DEPENDS_SEMANTICS) {
14592                 /* Under /d, everything in the upper half of the Latin1 range
14593                  * matches these complements */
14594                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14595             }
14596             else if (AT_LEAST_ASCII_RESTRICTED) {
14597                 /* Under /a and /aa, everything above ASCII matches these
14598                  * complements */
14599                 _invlist_union_complement_2nd(nposixes,
14600                                               PL_XPosix_ptrs[_CC_ASCII],
14601                                               &nposixes);
14602             }
14603             if (posixes) {
14604                 _invlist_union(posixes, nposixes, &posixes);
14605                 SvREFCNT_dec_NN(nposixes);
14606             }
14607             else {
14608                 posixes = nposixes;
14609             }
14610         }
14611         if (! DEPENDS_SEMANTICS) {
14612             if (cp_list) {
14613                 _invlist_union(cp_list, posixes, &cp_list);
14614                 SvREFCNT_dec_NN(posixes);
14615             }
14616             else {
14617                 cp_list = posixes;
14618             }
14619         }
14620         else {
14621             /* Under /d, we put into a separate list the Latin1 things that
14622              * match only when the target string is utf8 */
14623             SV* nonascii_but_latin1_properties = NULL;
14624             _invlist_intersection(posixes, PL_UpperLatin1,
14625                                   &nonascii_but_latin1_properties);
14626             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14627                               &posixes);
14628             if (cp_list) {
14629                 _invlist_union(cp_list, posixes, &cp_list);
14630                 SvREFCNT_dec_NN(posixes);
14631             }
14632             else {
14633                 cp_list = posixes;
14634             }
14635
14636             if (depends_list) {
14637                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14638                                &depends_list);
14639                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14640             }
14641             else {
14642                 depends_list = nonascii_but_latin1_properties;
14643             }
14644         }
14645     }
14646
14647     /* And combine the result (if any) with any inversion list from properties.
14648      * The lists are kept separate up to now so that we can distinguish the two
14649      * in regards to matching above-Unicode.  A run-time warning is generated
14650      * if a Unicode property is matched against a non-Unicode code point. But,
14651      * we allow user-defined properties to match anything, without any warning,
14652      * and we also suppress the warning if there is a portion of the character
14653      * class that isn't a Unicode property, and which matches above Unicode, \W
14654      * or [\x{110000}] for example.
14655      * (Note that in this case, unlike the Posix one above, there is no
14656      * <depends_list>, because having a Unicode property forces Unicode
14657      * semantics.) */
14658     if (properties) {
14659         if (cp_list) {
14660
14661             /* If it matters to the final outcome, see if a non-property
14662              * component of the class matches above Unicode.  If so, the
14663              * warning gets suppressed.  This is true even if just a single
14664              * such code point is specified, as though not strictly correct if
14665              * another such code point is matched against, the fact that they
14666              * are using above-Unicode code points indicates they should know
14667              * the issues involved */
14668             if (warn_super) {
14669                 warn_super = ! (invert
14670                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14671             }
14672
14673             _invlist_union(properties, cp_list, &cp_list);
14674             SvREFCNT_dec_NN(properties);
14675         }
14676         else {
14677             cp_list = properties;
14678         }
14679
14680         if (warn_super) {
14681             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14682         }
14683     }
14684
14685     /* Here, we have calculated what code points should be in the character
14686      * class.
14687      *
14688      * Now we can see about various optimizations.  Fold calculation (which we
14689      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14690      * would invert to include K, which under /i would match k, which it
14691      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14692      * folded until runtime */
14693
14694     /* If we didn't do folding, it's because some information isn't available
14695      * until runtime; set the run-time fold flag for these.  (We don't have to
14696      * worry about properties folding, as that is taken care of by the swash
14697      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14698      * locales, or the class matches at least one 0-255 range code point */
14699     if (LOC && FOLD) {
14700         if (only_utf8_locale_list) {
14701             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14702         }
14703         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14704                                the list */
14705             UV start, end;
14706             invlist_iterinit(cp_list);
14707             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14708                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14709             }
14710             invlist_iterfinish(cp_list);
14711         }
14712     }
14713
14714     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14715      * at compile time.  Besides not inverting folded locale now, we can't
14716      * invert if there are things such as \w, which aren't known until runtime
14717      * */
14718     if (cp_list
14719         && invert
14720         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14721         && ! depends_list
14722         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14723     {
14724         _invlist_invert(cp_list);
14725
14726         /* Any swash can't be used as-is, because we've inverted things */
14727         if (swash) {
14728             SvREFCNT_dec_NN(swash);
14729             swash = NULL;
14730         }
14731
14732         /* Clear the invert flag since have just done it here */
14733         invert = FALSE;
14734     }
14735
14736     if (ret_invlist) {
14737         *ret_invlist = cp_list;
14738         SvREFCNT_dec(swash);
14739
14740         /* Discard the generated node */
14741         if (SIZE_ONLY) {
14742             RExC_size = orig_size;
14743         }
14744         else {
14745             RExC_emit = orig_emit;
14746         }
14747         return orig_emit;
14748     }
14749
14750     /* Some character classes are equivalent to other nodes.  Such nodes take
14751      * up less room and generally fewer operations to execute than ANYOF nodes.
14752      * Above, we checked for and optimized into some such equivalents for
14753      * certain common classes that are easy to test.  Getting to this point in
14754      * the code means that the class didn't get optimized there.  Since this
14755      * code is only executed in Pass 2, it is too late to save space--it has
14756      * been allocated in Pass 1, and currently isn't given back.  But turning
14757      * things into an EXACTish node can allow the optimizer to join it to any
14758      * adjacent such nodes.  And if the class is equivalent to things like /./,
14759      * expensive run-time swashes can be avoided.  Now that we have more
14760      * complete information, we can find things necessarily missed by the
14761      * earlier code.  I (khw) am not sure how much to look for here.  It would
14762      * be easy, but perhaps too slow, to check any candidates against all the
14763      * node types they could possibly match using _invlistEQ(). */
14764
14765     if (cp_list
14766         && ! invert
14767         && ! depends_list
14768         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14769         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14770
14771            /* We don't optimize if we are supposed to make sure all non-Unicode
14772             * code points raise a warning, as only ANYOF nodes have this check.
14773             * */
14774         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14775     {
14776         UV start, end;
14777         U8 op = END;  /* The optimzation node-type */
14778         const char * cur_parse= RExC_parse;
14779
14780         invlist_iterinit(cp_list);
14781         if (! invlist_iternext(cp_list, &start, &end)) {
14782
14783             /* Here, the list is empty.  This happens, for example, when a
14784              * Unicode property is the only thing in the character class, and
14785              * it doesn't match anything.  (perluniprops.pod notes such
14786              * properties) */
14787             op = OPFAIL;
14788             *flagp |= HASWIDTH|SIMPLE;
14789         }
14790         else if (start == end) {    /* The range is a single code point */
14791             if (! invlist_iternext(cp_list, &start, &end)
14792
14793                     /* Don't do this optimization if it would require changing
14794                      * the pattern to UTF-8 */
14795                 && (start < 256 || UTF))
14796             {
14797                 /* Here, the list contains a single code point.  Can optimize
14798                  * into an EXACTish node */
14799
14800                 value = start;
14801
14802                 if (! FOLD) {
14803                     op = EXACT;
14804                 }
14805                 else if (LOC) {
14806
14807                     /* A locale node under folding with one code point can be
14808                      * an EXACTFL, as its fold won't be calculated until
14809                      * runtime */
14810                     op = EXACTFL;
14811                 }
14812                 else {
14813
14814                     /* Here, we are generally folding, but there is only one
14815                      * code point to match.  If we have to, we use an EXACT
14816                      * node, but it would be better for joining with adjacent
14817                      * nodes in the optimization pass if we used the same
14818                      * EXACTFish node that any such are likely to be.  We can
14819                      * do this iff the code point doesn't participate in any
14820                      * folds.  For example, an EXACTF of a colon is the same as
14821                      * an EXACT one, since nothing folds to or from a colon. */
14822                     if (value < 256) {
14823                         if (IS_IN_SOME_FOLD_L1(value)) {
14824                             op = EXACT;
14825                         }
14826                     }
14827                     else {
14828                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14829                             op = EXACT;
14830                         }
14831                     }
14832
14833                     /* If we haven't found the node type, above, it means we
14834                      * can use the prevailing one */
14835                     if (op == END) {
14836                         op = compute_EXACTish(pRExC_state);
14837                     }
14838                 }
14839             }
14840         }
14841         else if (start == 0) {
14842             if (end == UV_MAX) {
14843                 op = SANY;
14844                 *flagp |= HASWIDTH|SIMPLE;
14845                 RExC_naughty++;
14846             }
14847             else if (end == '\n' - 1
14848                     && invlist_iternext(cp_list, &start, &end)
14849                     && start == '\n' + 1 && end == UV_MAX)
14850             {
14851                 op = REG_ANY;
14852                 *flagp |= HASWIDTH|SIMPLE;
14853                 RExC_naughty++;
14854             }
14855         }
14856         invlist_iterfinish(cp_list);
14857
14858         if (op != END) {
14859             RExC_parse = (char *)orig_parse;
14860             RExC_emit = (regnode *)orig_emit;
14861
14862             ret = reg_node(pRExC_state, op);
14863
14864             RExC_parse = (char *)cur_parse;
14865
14866             if (PL_regkind[op] == EXACT) {
14867                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14868                                            TRUE /* downgradable to EXACT */
14869                                           );
14870             }
14871
14872             SvREFCNT_dec_NN(cp_list);
14873             return ret;
14874         }
14875     }
14876
14877     /* Here, <cp_list> contains all the code points we can determine at
14878      * compile time that match under all conditions.  Go through it, and
14879      * for things that belong in the bitmap, put them there, and delete from
14880      * <cp_list>.  While we are at it, see if everything above 255 is in the
14881      * list, and if so, set a flag to speed up execution */
14882
14883     populate_ANYOF_from_invlist(ret, &cp_list);
14884
14885     if (invert) {
14886         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14887     }
14888
14889     /* Here, the bitmap has been populated with all the Latin1 code points that
14890      * always match.  Can now add to the overall list those that match only
14891      * when the target string is UTF-8 (<depends_list>). */
14892     if (depends_list) {
14893         if (cp_list) {
14894             _invlist_union(cp_list, depends_list, &cp_list);
14895             SvREFCNT_dec_NN(depends_list);
14896         }
14897         else {
14898             cp_list = depends_list;
14899         }
14900         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14901     }
14902
14903     /* If there is a swash and more than one element, we can't use the swash in
14904      * the optimization below. */
14905     if (swash && element_count > 1) {
14906         SvREFCNT_dec_NN(swash);
14907         swash = NULL;
14908     }
14909
14910     set_ANYOF_arg(pRExC_state, ret, cp_list,
14911                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14912                    ? listsv : NULL,
14913                   only_utf8_locale_list,
14914                   swash, has_user_defined_property);
14915
14916     *flagp |= HASWIDTH|SIMPLE;
14917
14918     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14919         RExC_contains_locale = 1;
14920     }
14921
14922     return ret;
14923 }
14924
14925 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14926
14927 STATIC void
14928 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14929                 regnode* const node,
14930                 SV* const cp_list,
14931                 SV* const runtime_defns,
14932                 SV* const only_utf8_locale_list,
14933                 SV* const swash,
14934                 const bool has_user_defined_property)
14935 {
14936     /* Sets the arg field of an ANYOF-type node 'node', using information about
14937      * the node passed-in.  If there is nothing outside the node's bitmap, the
14938      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14939      * the count returned by add_data(), having allocated and stored an array,
14940      * av, that that count references, as follows:
14941      *  av[0] stores the character class description in its textual form.
14942      *        This is used later (regexec.c:Perl_regclass_swash()) to
14943      *        initialize the appropriate swash, and is also useful for dumping
14944      *        the regnode.  This is set to &PL_sv_undef if the textual
14945      *        description is not needed at run-time (as happens if the other
14946      *        elements completely define the class)
14947      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14948      *        computed from av[0].  But if no further computation need be done,
14949      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14950      *  av[2] stores the inversion list of code points that match only if the
14951      *        current locale is UTF-8
14952      *  av[3] stores the cp_list inversion list for use in addition or instead
14953      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14954      *        (Otherwise everything needed is already in av[0] and av[1])
14955      *  av[4] is set if any component of the class is from a user-defined
14956      *        property; used only if av[3] exists */
          /* How the arguments are consumed below:
           *  runtime_defns          stored in av[0] with its reference count
           *                         incremented; &PL_sv_undef is stored if NULL
           *  swash                  stored in av[1] as passed; when present,
           *                         cp_list's reference count is decremented
           *                         here instead of cp_list being stored
           *  cp_list                stored in av[3] as passed (no reference
           *                         count increment) when there is no swash
           *  only_utf8_locale_list  stored in av[2] as passed, or &PL_sv_undef
           *                         is stored if it is NULL
           *  has_user_defined_property
           *                         stored in av[4] as a newly created UV SV
           * The array is wrapped in a reference created with newRV_noinc(), and
           * that reference is placed in the data array slot whose index
           * add_data() returns; the index becomes the node's arg. */
14957
14958     UV n;
14959
14960     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14961
          /* Nothing to store outside the bitmap: no auxiliary array is built */
14962     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
              /* In that case the node's flags must not have ANYOF_UTF8 or
               * ANYOF_NONBITMAP_NON_UTF8 set */
14963         assert(! (ANYOF_FLAGS(node)
14964                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14965         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14966     }
14967     else {
14968         AV * const av = newAV();
14969         SV *rv;
14970
              /* Conversely, at least one of these flags is expected to be set
               * whenever an auxiliary array is needed */
14971         assert(ANYOF_FLAGS(node)
14972                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14973
14974         av_store(av, 0, (runtime_defns)
14975                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14976         if (swash) {
                  /* A ready-made swash is only expected together with a
                   * cp_list; the swash is kept and this function's reference
                   * to cp_list is dropped */
14977             assert(cp_list);
14978             av_store(av, 1, swash);
14979             SvREFCNT_dec_NN(cp_list);
14980         }
14981         else {
                  /* No swash yet: leave the placeholder, and keep cp_list (plus
                   * the user-defined-property indicator) if there is one */
14982             av_store(av, 1, &PL_sv_undef);
14983             if (cp_list) {
14984                 av_store(av, 3, cp_list);
14985                 av_store(av, 4, newSVuv(has_user_defined_property));
14986             }
14987         }
14988
              /* Element 2 is always written, either with the list or with
               * &PL_sv_undef */
14989         if (only_utf8_locale_list) {
14990             av_store(av, 2, only_utf8_locale_list);
14991         }
14992         else {
14993             av_store(av, 2, &PL_sv_undef);
14994         }
14995
              /* Wrap the array in a reference (without adding to the array's
               * reference count), store it in a new data slot, and make the
               * node's arg that slot's index */
14996         rv = newRV_noinc(MUTABLE_SV(av));
14997         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14998         RExC_rxi->data->data[n] = (void*)rv;
14999         ARG_SET(node, n);
15000     }
15001 }
15002
15003
15004 /* reg_skipcomment()
15005
15006    Absorbs an /x style # comment from the input stream,
15007    returning a pointer to the first character beyond the comment, or if the
15008    comment terminates the pattern without anything following it, this returns
15009    one past the final character of the pattern (in other words, RExC_end) and
15010    sets the REG_RUN_ON_COMMENT_SEEN flag.
15011
15012    Note it's the caller's responsibility to ensure that we are
15013    actually in /x mode
15014
15015 */
15016
15017 PERL_STATIC_INLINE char*
15018 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15019 {
15020     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15021
          /* 'p' must point at the '#' that introduces the comment */
15022     assert(*p == '#');
15023
          /* Step forward one byte at a time; a newline ends the comment and the
           * position just after it is returned.
           * NOTE(review): because of the pre-increment, when no newline is found
           * the final iteration examines the byte at RExC_end itself; this
           * presumably relies on that byte being readable and not a newline
           * (e.g. a terminating NUL) -- confirm against the callers. */
15024     while (p < RExC_end) {
15025         if (*(++p) == '\n') {
15026             return p+1;
15027         }
15028     }
15029
15030     /* we ran off the end of the pattern without ending the comment, so we have
15031      * to add an \n when wrapping */
15032     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15033     return p;
15034 }
15035
15036 /* nextchar()
15037
15038    Advances the parse position, and optionally absorbs
15039    "whitespace" from the inputstream.
15040
15041    Without /x "whitespace" means (?#...) style comments only,
15042    with /x this means (?#...) and # comments and whitespace proper.
15043
15044    Returns the RExC_parse point from BEFORE the scan occurs.
15045
15046    This is the /x friendly way of saying RExC_parse++.
15047 */
15048
15049 STATIC char*
15050 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15051 {
          /* Remember the parse position on entry (this is what gets returned),
           * and step over the current character */
15052     char* const retval = RExC_parse++;
15053
15054     PERL_ARGS_ASSERT_NEXTCHAR;
15055
          /* Keep absorbing ignorable text until a pass through the loop finds
           * none */
15056     for (;;) {
              /* A (?#...) comment: at least its three introducer bytes must
               * remain in the pattern */
15057         if (RExC_end - RExC_parse >= 3
15058             && *RExC_parse == '('
15059             && RExC_parse[1] == '?'
15060             && RExC_parse[2] == '#')
15061         {
                  /* Scan for the closing parenthesis; reaching RExC_end first
                   * is a compile failure */
15062             while (*RExC_parse != ')') {
15063                 if (RExC_parse == RExC_end)
15064                     FAIL("Sequence (?#... not terminated");
15065                 RExC_parse++;
15066             }
                  /* Step over the ')' and look again for more to skip */
15067             RExC_parse++;
15068             continue;
15069         }
              /* Under /x, let regpatws() advance over what it treats as
               * ignorable; if it moved the position, adopt that and go around
               * again, since a (?#...) comment could follow */
15070         if (RExC_flags & RXf_PMf_EXTENDED) {
15071             char * p = regpatws(pRExC_state, RExC_parse,
15072                                           TRUE); /* means recognize comments */
15073             if (p != RExC_parse) {
15074                 RExC_parse = p;
15075                 continue;
15076             }
15077         }
              /* Nothing more was skipped on this pass */
15078         return retval;
15079     }
15080 }
15081
15082 /*
15083 - reg_node - emit a node
15084 */
15085 STATIC regnode *                        /* Location. */
15086 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15087 {
          /* Emits a node of type 'op' at the current emit position and returns
           * that position as it was on entry ('ret'). */
15088     regnode *ptr;
15089     regnode * const ret = RExC_emit;
15090     GET_RE_DEBUG_FLAGS_DECL;
15091
15092     PERL_ARGS_ASSERT_REG_NODE;
15093
          /* In the sizing pass nothing is written: align the running size, add
           * one to it, and return */
15094     if (SIZE_ONLY) {
15095         SIZE_ALIGN(RExC_size);
15096         RExC_size += 1;
15097         return(ret);
15098     }
          /* Refuse to write at or beyond the emit bound */
15099     if (RExC_emit >= RExC_emit_bound)
15100         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15101                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15102
15103     NODE_ALIGN_FILL(ret);
          /* Fill in the node through a scratch pointer; that pointer's value
           * afterwards becomes the new emit position (stored at the bottom of
           * this function) */
15104     ptr = ret;
15105     FILL_ADVANCE_NODE(ptr, op);
15106 #ifdef RE_TRACK_PATTERN_OFFSETS
15107     if (RExC_offsets) {         /* MJD */
15108         MJD_OFFSET_DEBUG(
15109               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15110               "reg_node", __LINE__,
15111               PL_reg_name[op],
15112               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15113                 ? "Overwriting end of array!\n" : "OK",
15114               (UV)(RExC_emit - RExC_emit_start),
15115               (UV)(RExC_parse - RExC_start),
15116               (UV)RExC_offsets[0]));
              /* Record the parse position for this node; for an END node the
               * position one further along is recorded */
15117         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15118     }
15119 #endif
15120     RExC_emit = ptr;
15121     return(ret);
15122 }
15123
15124 /*
15125 - reganode - emit a node with an argument
15126 */
15127 STATIC regnode *                        /* Location. */
15128 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15129 {
          /* Emits a node of type 'op' carrying the argument 'arg' at the current
           * emit position and returns that position as it was on entry
           * ('ret'). */
15130     regnode *ptr;
15131     regnode * const ret = RExC_emit;
15132     GET_RE_DEBUG_FLAGS_DECL;
15133
15134     PERL_ARGS_ASSERT_REGANODE;
15135
          /* In the sizing pass nothing is written: align the running size, add
           * two to it, and return */
15136     if (SIZE_ONLY) {
15137         SIZE_ALIGN(RExC_size);
15138         RExC_size += 2;
15139         /*
15140            We can't do this:
15141
15142            assert(2==regarglen[op]+1);
15143
15144            Anything larger than this has to allocate the extra amount.
15145            If we changed this to be:
15146
15147            RExC_size += (1 + regarglen[op]);
15148
15149            then it wouldn't matter. It's not clear what side effect
15150            might come from that so it's not done so far.
15151            -- dmq
15152         */
15153         return(ret);
15154     }
          /* Refuse to write at or beyond the emit bound.  (The message text says
           * "reg_node" even though this is reganode.) */
15155     if (RExC_emit >= RExC_emit_bound)
15156         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15157                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15158
15159     NODE_ALIGN_FILL(ret);
          /* Fill in the node and its argument through a scratch pointer; that
           * pointer's value afterwards becomes the new emit position */
15160     ptr = ret;
15161     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15162 #ifdef RE_TRACK_PATTERN_OFFSETS
15163     if (RExC_offsets) {         /* MJD */
15164         MJD_OFFSET_DEBUG(
15165               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15166               "reganode",
15167               __LINE__,
15168               PL_reg_name[op],
15169               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15170               "Overwriting end of array!\n" : "OK",
15171               (UV)(RExC_emit - RExC_emit_start),
15172               (UV)(RExC_parse - RExC_start),
15173               (UV)RExC_offsets[0]));
15174         Set_Cur_Node_Offset;
15175     }
15176 #endif
15177     RExC_emit = ptr;
15178     return(ret);
15179 }
15180
15181 /*
15182 - reguni - emit (if appropriate) a Unicode character
15183 */
15184 PERL_STATIC_INLINE STRLEN
15185 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15186 {
15187     PERL_ARGS_ASSERT_REGUNI;
15188
          /* In the sizing pass nothing is written to 's' and the result is
           * UNISKIP(uv).  Otherwise 'uv' is written to 's' by uvchr_to_utf8(),
           * and the result is the distance from 's' to the pointer that call
           * returns. */
15189     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15190 }
15191
15192 /*
15193 - reginsert - insert an operator in front of already-emitted operand
15194 *
15195 * Means relocating the operand.
15196 */
15197 STATIC void
15198 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15199 {
          /* Opens a gap at 'opnd' big enough for a node of type 'op' by moving
           * everything from 'opnd' up to the current emit position further
           * along, then writes the 'op' node into the gap.  'depth' is unused
           * here. */
15200     regnode *src;
15201     regnode *dst;
15202     regnode *place;
          /* Number of regnodes the new node occupies: one step plus the
           * regarglen[] entry for this op */
15203     const int offset = regarglen[(U8)op];
15204     const int size = NODE_STEP_REGNODE + offset;
15205     GET_RE_DEBUG_FLAGS_DECL;
15206
15207     PERL_ARGS_ASSERT_REGINSERT;
15208     PERL_UNUSED_CONTEXT;
15209     PERL_UNUSED_ARG(depth);
15210 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15211     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
          /* In the sizing pass only the size tally changes */
15212     if (SIZE_ONLY) {
15213         RExC_size += size;
15214         return;
15215     }
15216
          /* 'src' is the old end of the emitted nodes, 'dst' the new end after
           * growing by 'size' */
15217     src = RExC_emit;
15218     RExC_emit += size;
15219     dst = RExC_emit;
          /* Recorded open/close paren positions at or after 'opnd' are about to
           * move, so shift them by the same amount */
15220     if (RExC_open_parens) {
15221         int paren;
15222         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15223         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15224             if ( RExC_open_parens[paren] >= opnd ) {
15225                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15226                 RExC_open_parens[paren] += size;
15227             } else {
15228                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15229             }
15230             if ( RExC_close_parens[paren] >= opnd ) {
15231                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15232                 RExC_close_parens[paren] += size;
15233             } else {
15234                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15235             }
15236         }
15237     }
15238
          /* Move the nodes one at a time, working from the end back down to
           * 'opnd'; source and destination ranges overlap, and copying in this
           * direction never overwrites a node before it has been moved */
15239     while (src > opnd) {
15240         StructCopy(--src, --dst, regnode);
15241 #ifdef RE_TRACK_PATTERN_OFFSETS
15242         if (RExC_offsets) {     /* MJD 20010112 */
15243             MJD_OFFSET_DEBUG(
15244                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15245                   "reg_insert",
15246                   __LINE__,
15247                   PL_reg_name[op],
15248                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15249                     ? "Overwriting end of array!\n" : "OK",
15250                   (UV)(src - RExC_emit_start),
15251                   (UV)(dst - RExC_emit_start),
15252                   (UV)RExC_offsets[0]));
                  /* The moved node keeps the offset and length recorded for it
                   * at its old position */
15253             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15254             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15255         }
15256 #endif
15257     }
15258
15259
15260     place = opnd;               /* Op node, where operand used to be. */
15261 #ifdef RE_TRACK_PATTERN_OFFSETS
15262     if (RExC_offsets) {         /* MJD */
15263         MJD_OFFSET_DEBUG(
15264               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15265               "reginsert",
15266               __LINE__,
15267               PL_reg_name[op],
15268               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15269               ? "Overwriting end of array!\n" : "OK",
15270               (UV)(place - RExC_emit_start),
15271               (UV)(RExC_parse - RExC_start),
15272               (UV)RExC_offsets[0]));
15273         Set_Node_Offset(place, RExC_parse);
15274         Set_Node_Length(place, 1);
15275     }
15276 #endif
          /* Note where the new node's argument area starts before 'place' is
           * handed to FILL_ADVANCE_NODE, then fill in the op and zero the
           * 'offset' regnodes of argument space */
15277     src = NEXTOPER(place);
15278     FILL_ADVANCE_NODE(place, op);
15279     Zero(src, offset, regnode);
15280 }
15281
15282 /*
15283 - regtail - set the next-pointer at the end of a node chain of p to val.
15284 - SEE ALSO: regtail_study
15285 */
15286 /* TODO: All three parms should be const */
15287 STATIC void
15288 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15289                 const regnode *val,U32 depth)
15290 {
          /* Follows the chain starting at 'p' with regnext() until its last
           * node, and makes that node point at 'val'.  'depth' is only used in
           * DEBUGGING builds. */
15291     regnode *scan;
15292     GET_RE_DEBUG_FLAGS_DECL;
15293
15294     PERL_ARGS_ASSERT_REGTAIL;
15295 #ifndef DEBUGGING
15296     PERL_UNUSED_ARG(depth);
15297 #endif
15298
          /* Nothing to link up in the sizing pass */
15299     if (SIZE_ONLY)
15300         return;
15301
15302     /* Find last node. */
15303     scan = p;
15304     for (;;) {
15305         regnode * const temp = regnext(scan);
15306         DEBUG_PARSE_r({
15307             SV * const mysv=sv_newmortal();
15308             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15309             regprop(RExC_rx, mysv, scan, NULL);
15310             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15311                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15312                     (temp == NULL ? "->" : ""),
15313                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15314             );
15315         });
              /* A node with no successor is the end of the chain */
15316         if (temp == NULL)
15317             break;
15318         scan = temp;
15319     }
15320
          /* Store the distance from the last node to 'val', in the node's arg
           * field when reg_off_by_arg[] is set for its type, and in its
           * next-offset field otherwise */
15321     if (reg_off_by_arg[OP(scan)]) {
15322         ARG_SET(scan, val - scan);
15323     }
15324     else {
15325         NEXT_OFF(scan) = val - scan;
15326     }
15327 }
15328
15329 #ifdef DEBUGGING
15330 /*
15331 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15332 - Look for optimizable sequences at the same time.
15333 - currently only looks for EXACT chains.
15334
15335 This is experimental code. The idea is to use this routine to perform
15336 in place optimizations on branches and groups as they are constructed,
15337 with the long term intention of removing optimization from study_chunk so
15338 that it is purely analytical.
15339
15340 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15341 to control which is which.
15342
15343 */
15344 /* TODO: All four parms should be const */
15345
15346 STATIC U8
15347 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15348                       const regnode *val,U32 depth)
15349 {
          /* Like regtail(): walks the chain from 'p' to its last node and makes
           * that node point at 'val'.  While walking it also classifies the
           * chain, and returns:
           *   PSEUDO     in the sizing pass, or if no EXACTish node was seen
           *              and nothing but NOTHING nodes were encountered
           *   an EXACTish op
           *              if every EXACTish node seen had that same type and
           *              the only other nodes were NOTHING
           *   0          otherwise
           * (With EXPERIMENTAL_INPLACESCAN defined it may instead return EXACT
           * early, when join_exact() returns true.) */
15350     dVAR;
15351     regnode *scan;
15352     U8 exact = PSEUDO;
15353 #ifdef EXPERIMENTAL_INPLACESCAN
15354     I32 min = 0;
15355 #endif
15356     GET_RE_DEBUG_FLAGS_DECL;
15357
15358     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15359
15360
15361     if (SIZE_ONLY)
15362         return exact;
15363
15364     /* Find last node. */
15365
15366     scan = p;
15367     for (;;) {
15368         regnode * const temp = regnext(scan);
15369 #ifdef EXPERIMENTAL_INPLACESCAN
15370         if (PL_regkind[OP(scan)] == EXACT) {
15371             bool unfolded_multi_char;   /* Unexamined in this routine */
15372             if (join_exact(pRExC_state, scan, &min,
15373                            &unfolded_multi_char, 1, val, depth+1))
15374                 return EXACT;
15375         }
15376 #endif
              /* Once 'exact' has dropped to 0 the classification is settled and
               * the nodes are no longer examined */
15377         if ( exact ) {
15378             switch (OP(scan)) {
15379                 case EXACT:
15380                 case EXACTF:
15381                 case EXACTFA_NO_TRIE:
15382                 case EXACTFA:
15383                 case EXACTFU:
15384                 case EXACTFU_SS:
15385                 case EXACTFL:
                              /* The first EXACTish node sets the type; a later
                               * one of a different type clears it */
15386                         if( exact == PSEUDO )
15387                             exact= OP(scan);
15388                         else if ( exact != OP(scan) )
15389                             exact= 0;
                              /* FALLTHROUGH: shares the 'break' below */
15390                 case NOTHING:
15391                     break;
15392                 default:
15393                     exact= 0;
15394             }
15395         }
15396         DEBUG_PARSE_r({
15397             SV * const mysv=sv_newmortal();
15398             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15399             regprop(RExC_rx, mysv, scan, NULL);
15400             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15401                 SvPV_nolen_const(mysv),
15402                 REG_NODE_NUM(scan),
15403                 PL_reg_name[exact]);
15404         });
              /* A node with no successor is the end of the chain */
15405         if (temp == NULL)
15406             break;
15407         scan = temp;
15408     }
15409     DEBUG_PARSE_r({
15410         SV * const mysv_val=sv_newmortal();
15411         DEBUG_PARSE_MSG("");
15412         regprop(RExC_rx, mysv_val, val, NULL);
15413         PerlIO_printf(Perl_debug_log,
15414                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15415                       SvPV_nolen_const(mysv_val),
15416                       (IV)REG_NODE_NUM(val),
15417                       (IV)(val - scan)
15418         );
15419     });
          /* Store the distance from the last node to 'val', in the node's arg
           * field when reg_off_by_arg[] is set for its type, and in its
           * next-offset field otherwise */
15420     if (reg_off_by_arg[OP(scan)]) {
15421         ARG_SET(scan, val - scan);
15422     }
15423     else {
15424         NEXT_OFF(scan) = val - scan;
15425     }
15426
15427     return exact;
15428 }
15429 #endif
15430
15431 /*
15432  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15433  */
15434 #ifdef DEBUGGING
15435
15436 static void
15437 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15438 {
15439     int bit;
15440     int set=0;
15441
15442     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15443
15444     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15445         if (flags & (1<<bit)) {
15446             if (!set++ && lead)
15447                 PerlIO_printf(Perl_debug_log, "%s",lead);
15448             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15449         }
15450     }
15451     if (lead)  {
15452         if (set)
15453             PerlIO_printf(Perl_debug_log, "\n");
15454         else
15455             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15456     }
15457 }
15458
15459 static void
15460 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15461 {
15462     int bit;
15463     int set=0;
15464     regex_charset cs;
15465
15466     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15467
15468     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15469         if (flags & (1<<bit)) {
15470             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15471                 continue;
15472             }
15473             if (!set++ && lead)
15474                 PerlIO_printf(Perl_debug_log, "%s",lead);
15475             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15476         }
15477     }
15478     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15479             if (!set++ && lead) {
15480                 PerlIO_printf(Perl_debug_log, "%s",lead);
15481             }
15482             switch (cs) {
15483                 case REGEX_UNICODE_CHARSET:
15484                     PerlIO_printf(Perl_debug_log, "UNICODE");
15485                     break;
15486                 case REGEX_LOCALE_CHARSET:
15487                     PerlIO_printf(Perl_debug_log, "LOCALE");
15488                     break;
15489                 case REGEX_ASCII_RESTRICTED_CHARSET:
15490                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15491                     break;
15492                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15493                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15494                     break;
15495                 default:
15496                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15497                     break;
15498             }
15499     }
15500     if (lead)  {
15501         if (set)
15502             PerlIO_printf(Perl_debug_log, "\n");
15503         else
15504             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15505     }
15506 }
15507 #endif
15508
15509 void
15510 Perl_regdump(pTHX_ const regexp *r)
15511 {
15512 #ifdef DEBUGGING
15513     dVAR;
15514     SV * const sv = sv_newmortal();
15515     SV *dsv= sv_newmortal();
15516     RXi_GET_DECL(r,ri);
15517     GET_RE_DEBUG_FLAGS_DECL;
15518
15519     PERL_ARGS_ASSERT_REGDUMP;
15520
15521     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15522
15523     /* Header fields of interest. */
15524     if (r->anchored_substr) {
15525         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15526             RE_SV_DUMPLEN(r->anchored_substr), 30);
15527         PerlIO_printf(Perl_debug_log,
15528                       "anchored %s%s at %"IVdf" ",
15529                       s, RE_SV_TAIL(r->anchored_substr),
15530                       (IV)r->anchored_offset);
15531     } else if (r->anchored_utf8) {
15532         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15533             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15534         PerlIO_printf(Perl_debug_log,
15535                       "anchored utf8 %s%s at %"IVdf" ",
15536                       s, RE_SV_TAIL(r->anchored_utf8),
15537                       (IV)r->anchored_offset);
15538     }
15539     if (r->float_substr) {
15540         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15541             RE_SV_DUMPLEN(r->float_substr), 30);
15542         PerlIO_printf(Perl_debug_log,
15543                       "floating %s%s at %"IVdf"..%"UVuf" ",
15544                       s, RE_SV_TAIL(r->float_substr),
15545                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15546     } else if (r->float_utf8) {
15547         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15548             RE_SV_DUMPLEN(r->float_utf8), 30);
15549         PerlIO_printf(Perl_debug_log,
15550                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15551                       s, RE_SV_TAIL(r->float_utf8),
15552                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15553     }
15554     if (r->check_substr || r->check_utf8)
15555         PerlIO_printf(Perl_debug_log,
15556                       (const char *)
15557                       (r->check_substr == r->float_substr
15558                        && r->check_utf8 == r->float_utf8
15559                        ? "(checking floating" : "(checking anchored"));
15560     if (r->intflags & PREGf_NOSCAN)
15561         PerlIO_printf(Perl_debug_log, " noscan");
15562     if (r->extflags & RXf_CHECK_ALL)
15563         PerlIO_printf(Perl_debug_log, " isall");
15564     if (r->check_substr || r->check_utf8)
15565         PerlIO_printf(Perl_debug_log, ") ");
15566
15567     if (ri->regstclass) {
15568         regprop(r, sv, ri->regstclass, NULL);
15569         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15570     }
15571     if (r->intflags & PREGf_ANCH) {
15572         PerlIO_printf(Perl_debug_log, "anchored");
15573         if (r->intflags & PREGf_ANCH_BOL)
15574             PerlIO_printf(Perl_debug_log, "(BOL)");
15575         if (r->intflags & PREGf_ANCH_MBOL)
15576             PerlIO_printf(Perl_debug_log, "(MBOL)");
15577         if (r->intflags & PREGf_ANCH_SBOL)
15578             PerlIO_printf(Perl_debug_log, "(SBOL)");
15579         if (r->intflags & PREGf_ANCH_GPOS)
15580             PerlIO_printf(Perl_debug_log, "(GPOS)");
15581         PerlIO_putc(Perl_debug_log, ' ');
15582     }
15583     if (r->intflags & PREGf_GPOS_SEEN)
15584         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15585     if (r->intflags & PREGf_SKIP)
15586         PerlIO_printf(Perl_debug_log, "plus ");
15587     if (r->intflags & PREGf_IMPLICIT)
15588         PerlIO_printf(Perl_debug_log, "implicit ");
15589     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15590     if (r->extflags & RXf_EVAL_SEEN)
15591         PerlIO_printf(Perl_debug_log, "with eval ");
15592     PerlIO_printf(Perl_debug_log, "\n");
15593     DEBUG_FLAGS_r({
15594         regdump_extflags("r->extflags: ",r->extflags);
15595         regdump_intflags("r->intflags: ",r->intflags);
15596     });
15597 #else
15598     PERL_ARGS_ASSERT_REGDUMP;
15599     PERL_UNUSED_CONTEXT;
15600     PERL_UNUSED_ARG(r);
15601 #endif  /* DEBUGGING */
15602 }
15603
15604 /*
15605 - regprop - printable representation of opcode, with run time support
15606 */
15607
15608 void
15609 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15610 {
15611 #ifdef DEBUGGING
15612     dVAR;
15613     int k;
15614
15615     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15616     static const char * const anyofs[] = {
15617 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15618     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15619     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15620     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15621     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15622     || _CC_VERTSPACE != 16
15623   #error Need to adjust order of anyofs[]
15624 #endif
15625         "\\w",
15626         "\\W",
15627         "\\d",
15628         "\\D",
15629         "[:alpha:]",
15630         "[:^alpha:]",
15631         "[:lower:]",
15632         "[:^lower:]",
15633         "[:upper:]",
15634         "[:^upper:]",
15635         "[:punct:]",
15636         "[:^punct:]",
15637         "[:print:]",
15638         "[:^print:]",
15639         "[:alnum:]",
15640         "[:^alnum:]",
15641         "[:graph:]",
15642         "[:^graph:]",
15643         "[:cased:]",
15644         "[:^cased:]",
15645         "\\s",
15646         "\\S",
15647         "[:blank:]",
15648         "[:^blank:]",
15649         "[:xdigit:]",
15650         "[:^xdigit:]",
15651         "[:space:]",
15652         "[:^space:]",
15653         "[:cntrl:]",
15654         "[:^cntrl:]",
15655         "[:ascii:]",
15656         "[:^ascii:]",
15657         "\\v",
15658         "\\V"
15659     };
15660     RXi_GET_DECL(prog,progi);
15661     GET_RE_DEBUG_FLAGS_DECL;
15662
15663     PERL_ARGS_ASSERT_REGPROP;
15664
15665     sv_setpvs(sv, "");
15666
15667     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15668         /* It would be nice to FAIL() here, but this may be called from
15669            regexec.c, and it would be hard to supply pRExC_state. */
15670         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15671                                               (int)OP(o), (int)REGNODE_MAX);
15672     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15673
15674     k = PL_regkind[OP(o)];
15675
15676     if (k == EXACT) {
15677         sv_catpvs(sv, " ");
15678         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15679          * is a crude hack but it may be the best for now since
15680          * we have no flag "this EXACTish node was UTF-8"
15681          * --jhi */
15682         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15683                   PERL_PV_ESCAPE_UNI_DETECT |
15684                   PERL_PV_ESCAPE_NONASCII   |
15685                   PERL_PV_PRETTY_ELLIPSES   |
15686                   PERL_PV_PRETTY_LTGT       |
15687                   PERL_PV_PRETTY_NOCLEAR
15688                   );
15689     } else if (k == TRIE) {
15690         /* print the details of the trie in dumpuntil instead, as
15691          * progi->data isn't available here */
15692         const char op = OP(o);
15693         const U32 n = ARG(o);
15694         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15695                (reg_ac_data *)progi->data->data[n] :
15696                NULL;
15697         const reg_trie_data * const trie
15698             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15699
15700         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15701         DEBUG_TRIE_COMPILE_r(
15702           Perl_sv_catpvf(aTHX_ sv,
15703             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15704             (UV)trie->startstate,
15705             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15706             (UV)trie->wordcount,
15707             (UV)trie->minlen,
15708             (UV)trie->maxlen,
15709             (UV)TRIE_CHARCOUNT(trie),
15710             (UV)trie->uniquecharcount
15711           );
15712         );
15713         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15714             sv_catpvs(sv, "[");
15715             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15716                                                    ? ANYOF_BITMAP(o)
15717                                                    : TRIE_BITMAP(trie));
15718             sv_catpvs(sv, "]");
15719         }
15720
15721     } else if (k == CURLY) {
15722         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15723             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15724         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15725     }
15726     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15727         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15728     else if (k == REF || k == OPEN || k == CLOSE
15729              || k == GROUPP || OP(o)==ACCEPT)
15730     {
15731         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15732         if ( RXp_PAREN_NAMES(prog) ) {
15733             if ( k != REF || (OP(o) < NREF)) {
15734                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15735                 SV **name= av_fetch(list, ARG(o), 0 );
15736                 if (name)
15737                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15738             }
15739             else {
15740                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15741                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15742                 I32 *nums=(I32*)SvPVX(sv_dat);
15743                 SV **name= av_fetch(list, nums[0], 0 );
15744                 I32 n;
15745                 if (name) {
15746                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15747                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15748                                     (n ? "," : ""), (IV)nums[n]);
15749                     }
15750                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15751                 }
15752             }
15753         }
15754         if ( k == REF && reginfo) {
15755             U32 n = ARG(o);  /* which paren pair */
15756             I32 ln = prog->offs[n].start;
15757             if (prog->lastparen < n || ln == -1)
15758                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15759             else if (ln == prog->offs[n].end)
15760                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15761             else {
15762                 const char *s = reginfo->strbeg + ln;
15763                 Perl_sv_catpvf(aTHX_ sv, ": ");
15764                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15765                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15766             }
15767         }
15768     } else if (k == GOSUB)
15769         /* Paren and offset */
15770         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15771     else if (k == VERB) {
15772         if (!o->flags)
15773             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15774                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15775     } else if (k == LOGICAL)
15776         /* 2: embedded, otherwise 1 */
15777         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15778     else if (k == ANYOF) {
15779         const U8 flags = ANYOF_FLAGS(o);
15780         int do_sep = 0;
15781
15782
15783         if (flags & ANYOF_LOCALE_FLAGS)
15784             sv_catpvs(sv, "{loc}");
15785         if (flags & ANYOF_LOC_FOLD)
15786             sv_catpvs(sv, "{i}");
15787         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15788         if (flags & ANYOF_INVERT)
15789             sv_catpvs(sv, "^");
15790
15791         /* output what the standard cp 0-255 bitmap matches */
15792         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15793
15794         /* output any special charclass tests (used entirely under use
15795          * locale) * */
15796         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15797             int i;
15798             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15799                 if (ANYOF_POSIXL_TEST(o,i)) {
15800                     sv_catpv(sv, anyofs[i]);
15801                     do_sep = 1;
15802                 }
15803             }
15804         }
15805
15806         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15807                       |ANYOF_UTF8
15808                       |ANYOF_NONBITMAP_NON_UTF8
15809                       |ANYOF_LOC_FOLD)))
15810         {
15811             if (do_sep) {
15812                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15813                 if (flags & ANYOF_INVERT)
15814                     /*make sure the invert info is in each */
15815                     sv_catpvs(sv, "^");
15816             }
15817
15818             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15819                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15820             }
15821
15822             /* output information about the unicode matching */
15823             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15824                 sv_catpvs(sv, "{unicode_all}");
15825             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15826                 SV *lv; /* Set if there is something outside the bit map. */
15827                 bool byte_output = FALSE;   /* If something in the bitmap has
15828                                                been output */
15829                 SV *only_utf8_locale;
15830
15831                 /* Get the stuff that wasn't in the bitmap */
15832                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15833                                                     &lv, &only_utf8_locale);
15834                 if (lv && lv != &PL_sv_undef) {
15835                     char *s = savesvpv(lv);
15836                     char * const origs = s;
15837
15838                     while (*s && *s != '\n')
15839                         s++;
15840
15841                     if (*s == '\n') {
15842                         const char * const t = ++s;
15843
15844                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15845                             sv_catpvs(sv, "{outside bitmap}");
15846                         }
15847                         else {
15848                             sv_catpvs(sv, "{utf8}");
15849                         }
15850
15851                         if (byte_output) {
15852                             sv_catpvs(sv, " ");
15853                         }
15854
15855                         while (*s) {
15856                             if (*s == '\n') {
15857
15858                                 /* Truncate very long output */
15859                                 if (s - origs > 256) {
15860                                     Perl_sv_catpvf(aTHX_ sv,
15861                                                 "%.*s...",
15862                                                 (int) (s - origs - 1),
15863                                                 t);
15864                                     goto out_dump;
15865                                 }
15866                                 *s = ' ';
15867                             }
15868                             else if (*s == '\t') {
15869                                 *s = '-';
15870                             }
15871                             s++;
15872                         }
15873                         if (s[-1] == ' ')
15874                             s[-1] = 0;
15875
15876                         sv_catpv(sv, t);
15877                     }
15878
15879                 out_dump:
15880
15881                     Safefree(origs);
15882                     SvREFCNT_dec_NN(lv);
15883                 }
15884
15885                 if ((flags & ANYOF_LOC_FOLD)
15886                      && only_utf8_locale
15887                      && only_utf8_locale != &PL_sv_undef)
15888                 {
15889                     UV start, end;
15890                     int max_entries = 256;
15891
15892                     sv_catpvs(sv, "{utf8 locale}");
15893                     invlist_iterinit(only_utf8_locale);
15894                     while (invlist_iternext(only_utf8_locale,
15895                                             &start, &end)) {
15896                         put_range(sv, start, end);
15897                         max_entries --;
15898                         if (max_entries < 0) {
15899                             sv_catpvs(sv, "...");
15900                             break;
15901                         }
15902                     }
15903                     invlist_iterfinish(only_utf8_locale);
15904                 }
15905             }
15906         }
15907
15908         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15909     }
15910     else if (k == POSIXD || k == NPOSIXD) {
15911         U8 index = FLAGS(o) * 2;
15912         if (index < C_ARRAY_LENGTH(anyofs)) {
15913             if (*anyofs[index] != '[')  {
15914                 sv_catpv(sv, "[");
15915             }
15916             sv_catpv(sv, anyofs[index]);
15917             if (*anyofs[index] != '[')  {
15918                 sv_catpv(sv, "]");
15919             }
15920         }
15921         else {
15922             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15923         }
15924     }
15925     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15926         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15927 #else
15928     PERL_UNUSED_CONTEXT;
15929     PERL_UNUSED_ARG(sv);
15930     PERL_UNUSED_ARG(o);
15931     PERL_UNUSED_ARG(prog);
15932     PERL_UNUSED_ARG(reginfo);
15933 #endif  /* DEBUGGING */
15934 }
15935
15936
15937
15938 SV *
15939 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15940 {                               /* Assume that RE_INTUIT is set */
15941     struct regexp *const prog = ReANY(r);
15942     GET_RE_DEBUG_FLAGS_DECL;
15943
15944     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15945     PERL_UNUSED_CONTEXT;
15946
15947     DEBUG_COMPILE_r(
15948         {
15949             const char * const s = SvPV_nolen_const(prog->check_substr
15950                       ? prog->check_substr : prog->check_utf8);
15951
15952             if (!PL_colorset) reginitcolors();
15953             PerlIO_printf(Perl_debug_log,
15954                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15955                       PL_colors[4],
15956                       prog->check_substr ? "" : "utf8 ",
15957                       PL_colors[5],PL_colors[0],
15958                       s,
15959                       PL_colors[1],
15960                       (strlen(s) > 60 ? "..." : ""));
15961         } );
15962
15963     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15964 }
15965
15966 /*
15967    pregfree()
15968
15969    handles refcounting and freeing the perl core regexp structure. When
15970    it is necessary to actually free the structure the first thing it
15971    does is call the 'free' method of the regexp_engine associated to
15972    the regexp, allowing the handling of the void *pprivate; member
15973    first. (This routine is not overridable by extensions, which is why
15974    the extensions free is called first.)
15975
15976    See regdupe and regdupe_internal if you change anything here.
15977 */
15978 #ifndef PERL_IN_XSUB_RE
15979 void
15980 Perl_pregfree(pTHX_ REGEXP *r)
15981 {
15982     SvREFCNT_dec(r);
15983 }
15984
15985 void
15986 Perl_pregfree2(pTHX_ REGEXP *rx)
15987 {
15988     struct regexp *const r = ReANY(rx);
15989     GET_RE_DEBUG_FLAGS_DECL;
15990
15991     PERL_ARGS_ASSERT_PREGFREE2;
15992
15993     if (r->mother_re) {
15994         ReREFCNT_dec(r->mother_re);
15995     } else {
15996         CALLREGFREE_PVT(rx); /* free the private data */
15997         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15998         Safefree(r->xpv_len_u.xpvlenu_pv);
15999     }
16000     if (r->substrs) {
16001         SvREFCNT_dec(r->anchored_substr);
16002         SvREFCNT_dec(r->anchored_utf8);
16003         SvREFCNT_dec(r->float_substr);
16004         SvREFCNT_dec(r->float_utf8);
16005         Safefree(r->substrs);
16006     }
16007     RX_MATCH_COPY_FREE(rx);
16008 #ifdef PERL_ANY_COW
16009     SvREFCNT_dec(r->saved_copy);
16010 #endif
16011     Safefree(r->offs);
16012     SvREFCNT_dec(r->qr_anoncv);
16013     rx->sv_u.svu_rx = 0;
16014 }
16015
16016 /*  reg_temp_copy()
16017
16018     This is a hacky workaround to the structural issue of match results
16019     being stored in the regexp structure which is in turn stored in
16020     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16021     could be PL_curpm in multiple contexts, and could require multiple
16022     result sets being associated with the pattern simultaneously, such
16023     as when doing a recursive match with (??{$qr})
16024
    The solution is to make a lightweight copy of the regexp structure
    when a qr// is returned from the code executed by (??{$qr}).  This
    lightweight copy doesn't actually own any of its data except for
    the start/end and the actual regexp structure itself.
16029
16030 */
16031
16032
16033 REGEXP *
16034 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16035 {
16036     struct regexp *ret;
16037     struct regexp *const r = ReANY(rx);
16038     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16039
16040     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16041
16042     if (!ret_x)
16043         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16044     else {
16045         SvOK_off((SV *)ret_x);
16046         if (islv) {
16047             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16048                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16049                made both spots point to the same regexp body.) */
16050             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16051             assert(!SvPVX(ret_x));
16052             ret_x->sv_u.svu_rx = temp->sv_any;
16053             temp->sv_any = NULL;
16054             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16055             SvREFCNT_dec_NN(temp);
16056             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16057                ing below will not set it. */
16058             SvCUR_set(ret_x, SvCUR(rx));
16059         }
16060     }
16061     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16062        sv_force_normal(sv) is called.  */
16063     SvFAKE_on(ret_x);
16064     ret = ReANY(ret_x);
16065
16066     SvFLAGS(ret_x) |= SvUTF8(rx);
16067     /* We share the same string buffer as the original regexp, on which we
16068        hold a reference count, incremented when mother_re is set below.
16069        The string pointer is copied here, being part of the regexp struct.
16070      */
16071     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16072            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16073     if (r->offs) {
16074         const I32 npar = r->nparens+1;
16075         Newx(ret->offs, npar, regexp_paren_pair);
16076         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16077     }
16078     if (r->substrs) {
16079         Newx(ret->substrs, 1, struct reg_substr_data);
16080         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16081
16082         SvREFCNT_inc_void(ret->anchored_substr);
16083         SvREFCNT_inc_void(ret->anchored_utf8);
16084         SvREFCNT_inc_void(ret->float_substr);
16085         SvREFCNT_inc_void(ret->float_utf8);
16086
16087         /* check_substr and check_utf8, if non-NULL, point to either their
16088            anchored or float namesakes, and don't hold a second reference.  */
16089     }
16090     RX_MATCH_COPIED_off(ret_x);
16091 #ifdef PERL_ANY_COW
16092     ret->saved_copy = NULL;
16093 #endif
16094     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16095     SvREFCNT_inc_void(ret->qr_anoncv);
16096
16097     return ret_x;
16098 }
16099 #endif
16100
16101 /* regfree_internal()
16102
16103    Free the private data in a regexp. This is overloadable by
16104    extensions. Perl takes care of the regexp structure in pregfree(),
16105    this covers the *pprivate pointer which technically perl doesn't
16106    know about, however of course we have to handle the
16107    regexp_internal structure when no extension is in use.
16108
16109    Note this is called before freeing anything in the regexp
16110    structure.
16111  */
16112
16113 void
16114 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16115 {
16116     struct regexp *const r = ReANY(rx);
16117     RXi_GET_DECL(r,ri);
16118     GET_RE_DEBUG_FLAGS_DECL;
16119
16120     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16121
16122     DEBUG_COMPILE_r({
16123         if (!PL_colorset)
16124             reginitcolors();
16125         {
16126             SV *dsv= sv_newmortal();
16127             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16128                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16129             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16130                 PL_colors[4],PL_colors[5],s);
16131         }
16132     });
16133 #ifdef RE_TRACK_PATTERN_OFFSETS
16134     if (ri->u.offsets)
16135         Safefree(ri->u.offsets);             /* 20010421 MJD */
16136 #endif
16137     if (ri->code_blocks) {
16138         int n;
16139         for (n = 0; n < ri->num_code_blocks; n++)
16140             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16141         Safefree(ri->code_blocks);
16142     }
16143
16144     if (ri->data) {
16145         int n = ri->data->count;
16146
16147         while (--n >= 0) {
16148           /* If you add a ->what type here, update the comment in regcomp.h */
16149             switch (ri->data->what[n]) {
16150             case 'a':
16151             case 'r':
16152             case 's':
16153             case 'S':
16154             case 'u':
16155                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16156                 break;
16157             case 'f':
16158                 Safefree(ri->data->data[n]);
16159                 break;
16160             case 'l':
16161             case 'L':
16162                 break;
16163             case 'T':
16164                 { /* Aho Corasick add-on structure for a trie node.
16165                      Used in stclass optimization only */
16166                     U32 refcount;
16167                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16168 #ifdef USE_ITHREADS
16169                     dVAR;
16170 #endif
16171                     OP_REFCNT_LOCK;
16172                     refcount = --aho->refcount;
16173                     OP_REFCNT_UNLOCK;
16174                     if ( !refcount ) {
16175                         PerlMemShared_free(aho->states);
16176                         PerlMemShared_free(aho->fail);
16177                          /* do this last!!!! */
16178                         PerlMemShared_free(ri->data->data[n]);
16179                         /* we should only ever get called once, so
16180                          * assert as much, and also guard the free
16181                          * which /might/ happen twice. At the least
16182                          * it will make code anlyzers happy and it
16183                          * doesn't cost much. - Yves */
16184                         assert(ri->regstclass);
16185                         if (ri->regstclass) {
16186                             PerlMemShared_free(ri->regstclass);
16187                             ri->regstclass = 0;
16188                         }
16189                     }
16190                 }
16191                 break;
16192             case 't':
16193                 {
16194                     /* trie structure. */
16195                     U32 refcount;
16196                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16197 #ifdef USE_ITHREADS
16198                     dVAR;
16199 #endif
16200                     OP_REFCNT_LOCK;
16201                     refcount = --trie->refcount;
16202                     OP_REFCNT_UNLOCK;
16203                     if ( !refcount ) {
16204                         PerlMemShared_free(trie->charmap);
16205                         PerlMemShared_free(trie->states);
16206                         PerlMemShared_free(trie->trans);
16207                         if (trie->bitmap)
16208                             PerlMemShared_free(trie->bitmap);
16209                         if (trie->jump)
16210                             PerlMemShared_free(trie->jump);
16211                         PerlMemShared_free(trie->wordinfo);
16212                         /* do this last!!!! */
16213                         PerlMemShared_free(ri->data->data[n]);
16214                     }
16215                 }
16216                 break;
16217             default:
16218                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16219                                                     ri->data->what[n]);
16220             }
16221         }
16222         Safefree(ri->data->what);
16223         Safefree(ri->data);
16224     }
16225
16226     Safefree(ri);
16227 }
16228
/* Cloning shorthands: run an AV or HV through sv_dup_inc() and cast the
 * result back to the container type.  's' is the source container and 't'
 * is the CLONE_PARAMS pointer handed on to sv_dup_inc().  Both arguments
 * are parenthesized so that an expression argument is cast as a whole. */
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)(s), (t)))
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)(s), (t)))
/* savepvn() of 'n' bytes at 'p', or NULL when 'p' is NULL.  Note that 'p'
 * is evaluated twice, so it must not have side effects. */
#define SAVEPVN(p,n)    ((p) ? savepvn((p),(n)) : NULL)
16232
/*
   re_dup_guts - duplicate a regexp.

   This routine is expected to clone a given regexp structure. It is only
   compiled under USE_ITHREADS.

   After all of the core data stored in struct regexp is duplicated
   the regexp_engine.dupe method is used to copy any private data
   stored in the *pprivate pointer. This allows extensions to handle
   any duplication they need to do.

   See pregfree() and regfree_internal() if you change anything here.
*/
16246 #if defined(USE_ITHREADS)
16247 #ifndef PERL_IN_XSUB_RE
16248 void
16249 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16250 {
16251     dVAR;
16252     I32 npar;
16253     const struct regexp *r = ReANY(sstr);
16254     struct regexp *ret = ReANY(dstr);
16255
16256     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16257
16258     npar = r->nparens+1;
16259     Newx(ret->offs, npar, regexp_paren_pair);
16260     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16261
16262     if (ret->substrs) {
16263         /* Do it this way to avoid reading from *r after the StructCopy().
16264            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16265            cache, it doesn't matter.  */
16266         const bool anchored = r->check_substr
16267             ? r->check_substr == r->anchored_substr
16268             : r->check_utf8 == r->anchored_utf8;
16269         Newx(ret->substrs, 1, struct reg_substr_data);
16270         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16271
16272         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16273         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16274         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16275         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16276
16277         /* check_substr and check_utf8, if non-NULL, point to either their
16278            anchored or float namesakes, and don't hold a second reference.  */
16279
16280         if (ret->check_substr) {
16281             if (anchored) {
16282                 assert(r->check_utf8 == r->anchored_utf8);
16283                 ret->check_substr = ret->anchored_substr;
16284                 ret->check_utf8 = ret->anchored_utf8;
16285             } else {
16286                 assert(r->check_substr == r->float_substr);
16287                 assert(r->check_utf8 == r->float_utf8);
16288                 ret->check_substr = ret->float_substr;
16289                 ret->check_utf8 = ret->float_utf8;
16290             }
16291         } else if (ret->check_utf8) {
16292             if (anchored) {
16293                 ret->check_utf8 = ret->anchored_utf8;
16294             } else {
16295                 ret->check_utf8 = ret->float_utf8;
16296             }
16297         }
16298     }
16299
16300     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16301     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16302
16303     if (ret->pprivate)
16304         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16305
16306     if (RX_MATCH_COPIED(dstr))
16307         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16308     else
16309         ret->subbeg = NULL;
16310 #ifdef PERL_ANY_COW
16311     ret->saved_copy = NULL;
16312 #endif
16313
16314     /* Whether mother_re be set or no, we need to copy the string.  We
16315        cannot refrain from copying it when the storage points directly to
16316        our mother regexp, because that's
16317                1: a buffer in a different thread
16318                2: something we no longer hold a reference on
16319                so we need to copy it locally.  */
16320     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16321     ret->mother_re   = NULL;
16322 }
16323 #endif /* PERL_IN_XSUB_RE */
16324
/*
   regdupe_internal()

   This is the internal complement to regdupe() which is used to copy
   the structure pointed to by the *pprivate pointer in the regexp.
   This is the core version of the extension overridable cloning hook.
   The regexp structure being duplicated will be copied by perl prior
   to this and will be provided as the regexp *r argument, however
   with the /old/ structure's pprivate pointer value. Thus this routine
   may override any copying normally done by perl.

   It returns a pointer to the new regexp_internal structure.
*/
16338
16339 void *
16340 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16341 {
16342     dVAR;
16343     struct regexp *const r = ReANY(rx);
16344     regexp_internal *reti;
16345     int len;
16346     RXi_GET_DECL(r,ri);
16347
16348     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16349
16350     len = ProgLen(ri);
16351
16352     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16353           char, regexp_internal);
16354     Copy(ri->program, reti->program, len+1, regnode);
16355
16356     reti->num_code_blocks = ri->num_code_blocks;
16357     if (ri->code_blocks) {
16358         int n;
16359         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16360                 struct reg_code_block);
16361         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16362                 struct reg_code_block);
16363         for (n = 0; n < ri->num_code_blocks; n++)
16364              reti->code_blocks[n].src_regex = (REGEXP*)
16365                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16366     }
16367     else
16368         reti->code_blocks = NULL;
16369
16370     reti->regstclass = NULL;
16371
16372     if (ri->data) {
16373         struct reg_data *d;
16374         const int count = ri->data->count;
16375         int i;
16376
16377         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16378                 char, struct reg_data);
16379         Newx(d->what, count, U8);
16380
16381         d->count = count;
16382         for (i = 0; i < count; i++) {
16383             d->what[i] = ri->data->what[i];
16384             switch (d->what[i]) {
16385                 /* see also regcomp.h and regfree_internal() */
16386             case 'a': /* actually an AV, but the dup function is identical.  */
16387             case 'r':
16388             case 's':
16389             case 'S':
16390             case 'u': /* actually an HV, but the dup function is identical.  */
16391                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16392                 break;
16393             case 'f':
16394                 /* This is cheating. */
16395                 Newx(d->data[i], 1, regnode_ssc);
16396                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16397                 reti->regstclass = (regnode*)d->data[i];
16398                 break;
16399             case 'T':
16400                 /* Trie stclasses are readonly and can thus be shared
16401                  * without duplication. We free the stclass in pregfree
16402                  * when the corresponding reg_ac_data struct is freed.
16403                  */
16404                 reti->regstclass= ri->regstclass;
16405                 /* FALLTHROUGH */
16406             case 't':
16407                 OP_REFCNT_LOCK;
16408                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16409                 OP_REFCNT_UNLOCK;
16410                 /* FALLTHROUGH */
16411             case 'l':
16412             case 'L':
16413                 d->data[i] = ri->data->data[i];
16414                 break;
16415             default:
16416                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16417                                                            ri->data->what[i]);
16418             }
16419         }
16420
16421         reti->data = d;
16422     }
16423     else
16424         reti->data = NULL;
16425
16426     reti->name_list_idx = ri->name_list_idx;
16427
16428 #ifdef RE_TRACK_PATTERN_OFFSETS
16429     if (ri->u.offsets) {
16430         Newx(reti->u.offsets, 2*len+1, U32);
16431         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16432     }
16433 #else
16434     SetProgLen(reti,len);
16435 #endif
16436
16437     return (void*)reti;
16438 }
16439
16440 #endif    /* USE_ITHREADS */
16441
16442 #ifndef PERL_IN_XSUB_RE
16443
16444 /*
16445  - regnext - dig the "next" pointer out of a node
16446  */
16447 regnode *
16448 Perl_regnext(pTHX_ regnode *p)
16449 {
16450     I32 offset;
16451
16452     if (!p)
16453         return(NULL);
16454
16455     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16456         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16457                                                 (int)OP(p), (int)REGNODE_MAX);
16458     }
16459
16460     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16461     if (offset == 0)
16462         return(NULL);
16463
16464     return(p+offset);
16465 }
16466 #endif
16467
16468 STATIC void
16469 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16470 {
16471     va_list args;
16472     STRLEN l1 = strlen(pat1);
16473     STRLEN l2 = strlen(pat2);
16474     char buf[512];
16475     SV *msv;
16476     const char *message;
16477
16478     PERL_ARGS_ASSERT_RE_CROAK2;
16479
16480     if (l1 > 510)
16481         l1 = 510;
16482     if (l1 + l2 > 510)
16483         l2 = 510 - l1;
16484     Copy(pat1, buf, l1 , char);
16485     Copy(pat2, buf + l1, l2 , char);
16486     buf[l1 + l2] = '\n';
16487     buf[l1 + l2 + 1] = '\0';
16488     va_start(args, pat2);
16489     msv = vmess(buf, &args);
16490     va_end(args);
16491     message = SvPV_const(msv,l1);
16492     if (l1 > 512)
16493         l1 = 512;
16494     Copy(message, buf, l1 , char);
16495     /* l1-1 to avoid \n */
16496     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16497 }
16498
16499 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16500
16501 #ifndef PERL_IN_XSUB_RE
16502 void
16503 Perl_save_re_context(pTHX)
16504 {
16505     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16506     if (PL_curpm) {
16507         const REGEXP * const rx = PM_GETRE(PL_curpm);
16508         if (rx) {
16509             U32 i;
16510             for (i = 1; i <= RX_NPARENS(rx); i++) {
16511                 char digits[TYPE_CHARS(long)];
16512                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16513                                                "%lu", (long)i);
16514                 GV *const *const gvp
16515                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16516
16517                 if (gvp) {
16518                     GV * const gv = *gvp;
16519                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16520                         save_scalar(gv);
16521                 }
16522             }
16523         }
16524     }
16525 }
16526 #endif
16527
16528 #ifdef DEBUGGING
16529
16530 STATIC void
16531 S_put_byte(pTHX_ SV *sv, int c)
16532 {
16533     PERL_ARGS_ASSERT_PUT_BYTE;
16534
16535     if (!isPRINT(c)) {
16536         switch (c) {
16537             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16538             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16539             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16540             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16541             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16542
16543             default:
16544                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16545                 break;
16546         }
16547     }
16548     else {
16549         const char string = c;
16550         if (c == '-' || c == ']' || c == '\\' || c == '^')
16551             sv_catpvs(sv, "\\");
16552         sv_catpvn(sv, &string, 1);
16553     }
16554 }
16555
16556 STATIC void
16557 S_put_range(pTHX_ SV *sv, UV start, UV end)
16558 {
16559
16560     /* Appends to 'sv' a displayable version of the range of code points from
16561      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
16562      * as-is (though some of these will be escaped by put_byte()).  For the
16563      * time being, this subroutine only works for latin1 (< 256) code points */
16564
16565     assert(start <= end);
16566
16567     PERL_ARGS_ASSERT_PUT_RANGE;
16568
16569     while (start <= end) {
16570         if (end - start < 3) {  /* Individual chars in short ranges */
16571             for (; start <= end; start++) {
16572                 put_byte(sv, start);
16573             }
16574             break;
16575         }
16576
16577         /* For small ranges that include printable ASCII characters, it's more
16578          * legible to print those characters rather than hex values.  For
16579          * larger ranges that include more than printables, it's probably
16580          * clearer to just give the start and end points of the range in hex,
16581          * and that's all we can do if there aren't any printables within the
16582          * range
16583          *
16584          * On ASCII platforms the range of printables is contiguous.  If the
16585          * entire range is printable, we print each character as such.  If the
16586          * range is partially printable and partially not, it's less likely
16587          * that the individual printables are meaningful, especially if all or
16588          * almost all of them are in the range.  But we err on the side of the
16589          * individual printables being meaningful by using the hex only if the
16590          * range contains all but 2 of the printables.
16591          *
16592          * On EBCDIC platforms, the printables are scattered around so that the
16593          * maximum range length containing only them is about 10.  Anything
16594          * longer we treat as hex; otherwise we examine the range character by
16595          * character to see */
16596 #ifdef EBCDIC
16597         if (start < 256 && (((end < 255) ? end : 255) - start <= 10))
16598 #else
16599         if ((isPRINT_A(start) && isPRINT_A(end))
16600             || (end >= 0x7F && (isPRINT_A(start) && start > 0x21))
16601             || ((end < 0x7D && isPRINT_A(end)) && start < 0x20))
16602 #endif
16603         {
16604             /* If the range beginning isn't an ASCII printable, we find the
16605              * last such in the range, then split the output, so all the
16606              * non-printables are in one subrange; then process the remaining
16607              * portion as usual.  If the entire range isn't printables, we
16608              * don't split, but drop down to print as hex */
16609             if (! isPRINT_A(start)) {
16610                 UV temp_end = start + 1;
16611                 while (temp_end <= end && ! isPRINT_A(temp_end)) {
16612                     temp_end++;
16613                 }
16614                 if (temp_end <= end) {
16615                     put_range(sv, start, temp_end - 1);
16616                     start = temp_end;
16617                     continue;
16618                 }
16619             }
16620
16621             /* If the range beginning is a digit, output a subrange of just the
16622              * digits, then process the remaining portion as usual */
16623             if (isDIGIT_A(start)) {
16624                 put_byte(sv, start);
16625                 sv_catpvs(sv, "-");
16626                 while (start <= end && isDIGIT_A(start)) start++;
16627                 put_byte(sv, start - 1);
16628                 continue;
16629             }
16630
16631             /* Similarly for alphabetics.  Because in both ASCII and EBCDIC,
16632              * the code points for upper and lower A-Z and a-z aren't
16633              * intermixed, the resulting subrange will consist solely of either
16634              * upper- or lower- alphabetics */
16635             if (isALPHA_A(start)) {
16636                 put_byte(sv, start);
16637                 sv_catpvs(sv, "-");
16638                 while (start <= end && isALPHA_A(start)) start++;
16639                 put_byte(sv, start - 1);
16640                 continue;
16641             }
16642
16643             /* We output any remaining printables as individual characters */
16644             if (isPUNCT_A(start) || isSPACE_A(start)) {
16645                 while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) {
16646                     put_byte(sv, start);
16647                     start++;
16648                 }
16649                 continue;
16650             }
16651         }
16652
16653         /* Here is a control or non-ascii.  Output the range or subrange as
16654          * hex. */
16655         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16656                        start,
16657                        (end < 256) ? end : 255);
16658         break;
16659     }
16660 }
16661
16662 STATIC bool
16663 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16664 {
16665     /* Appends to 'sv' a displayable version of the innards of the bracketed
16666      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16667      * output anything */
16668
16669     int i;
16670     bool has_output_anything = FALSE;
16671
16672     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16673
16674     for (i = 0; i < 256; i++) {
16675         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16676
16677             /* The character at index i should be output.  Find the next
16678              * character that should NOT be output */
16679             int j;
16680             for (j = i + 1; j <= 256; j++) {
16681                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16682                     break;
16683                 }
16684             }
16685
16686             /* Everything between them is a single range that should be output
16687              * */
16688             put_range(sv, i, j - 1);
16689             has_output_anything = TRUE;
16690             i = j;
16691         }
16692     }
16693
16694     return has_output_anything;
16695 }
16696
/* Helpers for dumpuntil().  Both rely on names in the invoking scope
 * ('optstart' and 'node'; DUMPUNTIL additionally on 'r', 'start', 'last',
 * 'sv', 'indent' and 'depth').
 *
 * CLEAR_OPTSTART: if a run of nodes is being tracked in 'optstart', report
 * how many nodes it spans (under the optimise debug flag) and stop
 * tracking.
 *
 * DUMPUNTIL(b,e): do CLEAR_OPTSTART, then dump the nodes from 'b' up to 'e'
 * one indent/depth level deeper and store the returned position in 'node'.
 *
 * Each expands to exactly one statement, so they are safe as the body of an
 * unbraced if/else. */
#define CLEAR_OPTSTART                                                       \
    STMT_START {                                                             \
        if (optstart) {                                                      \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                   \
                              " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL;                                                   \
        }                                                                    \
    } STMT_END

#define DUMPUNTIL(b,e)                                                       \
    STMT_START {                                                             \
        CLEAR_OPTSTART;                                                      \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);            \
    } STMT_END
16707
16708 STATIC const regnode *
16709 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16710             const regnode *last, const regnode *plast,
16711             SV* sv, I32 indent, U32 depth)
16712 {
16713     dVAR;
16714     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16715     const regnode *next;
16716     const regnode *optstart= NULL;
16717
16718     RXi_GET_DECL(r,ri);
16719     GET_RE_DEBUG_FLAGS_DECL;
16720
16721     PERL_ARGS_ASSERT_DUMPUNTIL;
16722
16723 #ifdef DEBUG_DUMPUNTIL
16724     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16725         last ? last-start : 0,plast ? plast-start : 0);
16726 #endif
16727
16728     if (plast && plast < last)
16729         last= plast;
16730
16731     while (PL_regkind[op] != END && (!last || node < last)) {
16732         assert(node);
16733         /* While that wasn't END last time... */
16734         NODE_ALIGN(node);
16735         op = OP(node);
16736         if (op == CLOSE || op == WHILEM)
16737             indent--;
16738         next = regnext((regnode *)node);
16739
16740         /* Where, what. */
16741         if (OP(node) == OPTIMIZED) {
16742             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16743                 optstart = node;
16744             else
16745                 goto after_print;
16746         } else
16747             CLEAR_OPTSTART;
16748
16749         regprop(r, sv, node, NULL);
16750         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16751                       (int)(2*indent + 1), "", SvPVX_const(sv));
16752
16753         if (OP(node) != OPTIMIZED) {
16754             if (next == NULL)           /* Next ptr. */
16755                 PerlIO_printf(Perl_debug_log, " (0)");
16756             else if (PL_regkind[(U8)op] == BRANCH
16757                      && PL_regkind[OP(next)] != BRANCH )
16758                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16759             else
16760                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16761             (void)PerlIO_putc(Perl_debug_log, '\n');
16762         }
16763
16764       after_print:
16765         if (PL_regkind[(U8)op] == BRANCHJ) {
16766             assert(next);
16767             {
16768                 const regnode *nnode = (OP(next) == LONGJMP
16769                                        ? regnext((regnode *)next)
16770                                        : next);
16771                 if (last && nnode > last)
16772                     nnode = last;
16773                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16774             }
16775         }
16776         else if (PL_regkind[(U8)op] == BRANCH) {
16777             assert(next);
16778             DUMPUNTIL(NEXTOPER(node), next);
16779         }
16780         else if ( PL_regkind[(U8)op]  == TRIE ) {
16781             const regnode *this_trie = node;
16782             const char op = OP(node);
16783             const U32 n = ARG(node);
16784             const reg_ac_data * const ac = op>=AHOCORASICK ?
16785                (reg_ac_data *)ri->data->data[n] :
16786                NULL;
16787             const reg_trie_data * const trie =
16788                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16789 #ifdef DEBUGGING
16790             AV *const trie_words
16791                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16792 #endif
16793             const regnode *nextbranch= NULL;
16794             I32 word_idx;
16795             sv_setpvs(sv, "");
16796             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16797                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16798
16799                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16800                    (int)(2*(indent+3)), "",
16801                     elem_ptr
16802                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16803                                 SvCUR(*elem_ptr), 60,
16804                                 PL_colors[0], PL_colors[1],
16805                                 (SvUTF8(*elem_ptr)
16806                                  ? PERL_PV_ESCAPE_UNI
16807                                  : 0)
16808                                 | PERL_PV_PRETTY_ELLIPSES
16809                                 | PERL_PV_PRETTY_LTGT
16810                             )
16811                     : "???"
16812                 );
16813                 if (trie->jump) {
16814                     U16 dist= trie->jump[word_idx+1];
16815                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16816                                (UV)((dist ? this_trie + dist : next) - start));
16817                     if (dist) {
16818                         if (!nextbranch)
16819                             nextbranch= this_trie + trie->jump[0];
16820                         DUMPUNTIL(this_trie + dist, nextbranch);
16821                     }
16822                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16823                         nextbranch= regnext((regnode *)nextbranch);
16824                 } else {
16825                     PerlIO_printf(Perl_debug_log, "\n");
16826                 }
16827             }
16828             if (last && next > last)
16829                 node= last;
16830             else
16831                 node= next;
16832         }
16833         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16834             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16835                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16836         }
16837         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16838             assert(next);
16839             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16840         }
16841         else if ( op == PLUS || op == STAR) {
16842             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16843         }
16844         else if (PL_regkind[(U8)op] == ANYOF) {
16845             /* arglen 1 + class block */
16846             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16847                           ? ANYOF_POSIXL_SKIP
16848                           : ANYOF_SKIP);
16849             node = NEXTOPER(node);
16850         }
16851         else if (PL_regkind[(U8)op] == EXACT) {
16852             /* Literal string, where present. */
16853             node += NODE_SZ_STR(node) - 1;
16854             node = NEXTOPER(node);
16855         }
16856         else {
16857             node = NEXTOPER(node);
16858             node += regarglen[(U8)op];
16859         }
16860         if (op == CURLYX || op == OPEN)
16861             indent++;
16862     }
16863     CLEAR_OPTSTART;
16864 #ifdef DEBUG_DUMPUNTIL
16865     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16866 #endif
16867     return node;
16868 }
16869
16870 #endif  /* DEBUGGING */
16871
16872 /*
16873  * Local variables:
16874  * c-indentation-style: bsd
16875  * c-basic-offset: 4
16876  * indent-tabs-mode: nil
16877  * End:
16878  *
16879  * ex: set ts=8 sts=4 sw=4 et:
16880  */