src/5021001/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105
 106 struct RExC_state_t {
 107     U32         flags;                  /* RXf_* are we folding, multilining? */
 108     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object
 113                                            pprivate field */
 114     char        *start;                 /* Start of input for compile */
 115     char        *end;                   /* End of input for compile */
 116     char        *parse;                 /* Input-scan pointer. */
 117     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 118     regnode     *emit_start;            /* Start of emitted-code area */
 119     regnode     *emit_bound;            /* First regnode outside of the
 120                                            allocated space */
 121     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 122                                            implies compiling, so don't emit */
 123     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 124                                            large enough for the largest
 125                                            non-EXACTish node, so can use it as
 126                                            scratch in pass1 */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     SSize_t     size;                   /* Code size. */
 131     I32                npar;            /* Capture buffer count, (OPEN) plus
 132                                            one. ("par" 0 is the whole
 133                                            pattern)*/
 134     I32         nestroot;               /* root parens we are in - used by
 135                                            accept */
 136     I32         extralen;
 137     I32         seen_zerolen;
 138     regnode     **open_parens;          /* pointers to open parens */
 139     regnode     **close_parens;         /* pointers to close parens */
 140     regnode     *opend;                 /* END node in program */
 141     I32         utf8;           /* whether the pattern is utf8 or not */
 142     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 143                                 /* XXX use this for future optimisation of case
 144                                  * where pattern must be upgraded to utf8. */
 145     I32         uni_semantics;  /* If a d charset modifier should use unicode
 146                                    rules, even if the pattern is not in
 147                                    utf8 */
 148     HV          *paren_names;           /* Paren names */
 149
 150     regnode     **recurse;              /* Recurse regops */
 151     I32         recurse_count;          /* Number of recurse regops */
 152     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153                                            through */
 154     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155     I32         in_lookbehind;
 156     I32         contains_locale;
 157     I32         contains_i;
 158     I32         override_recoding;
 159     I32         in_multi_char_class;
 160     struct reg_code_block *code_blocks; /* positions of literal (?{})
 161                                             within pattern */
 162     int         num_code_blocks;        /* size of code_blocks[] */
 163     int         code_index;             /* next code_blocks[] slot */
 164     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 165 #ifdef ADD_TO_REGEXEC
 166     char        *starttry;              /* -Dr: where regtry was called. */
 167 #define RExC_starttry   (pRExC_state->starttry)
 168 #endif
 169     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171     const char  *lastparse;
 172     I32         lastnum;
 173     AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse  (pRExC_state->lastparse)
 175 #define RExC_lastnum    (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags      (pRExC_state->flags)
 181 #define RExC_pm_flags   (pRExC_state->pm_flags)
 182 #define RExC_precomp    (pRExC_state->precomp)
 183 #define RExC_rx_sv      (pRExC_state->rx_sv)
 184 #define RExC_rx         (pRExC_state->rx)
 185 #define RExC_rxi        (pRExC_state->rxi)
 186 #define RExC_start      (pRExC_state->start)
 187 #define RExC_end        (pRExC_state->end)
 188 #define RExC_parse      (pRExC_state->parse)
 189 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 192                                                          others */
 193 #endif
 194 #define RExC_emit       (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar       (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen   (pRExC_state->extralen)
 206 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 207 #define RExC_utf8       (pRExC_state->utf8)
 208 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 210 #define RExC_open_parens        (pRExC_state->open_parens)
 211 #define RExC_close_parens       (pRExC_state->close_parens)
 212 #define RExC_opend      (pRExC_state->opend)
 213 #define RExC_paren_names        (pRExC_state->paren_names)
 214 #define RExC_recurse    (pRExC_state->recurse)
 215 #define RExC_recurse_count      (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218                                    (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale    (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225
 226 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 228         ((*s) == '{' && regcurly(s)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST           0       /* Worst case. */
 234 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE          0x02
 241 #define SPSTART         0x04    /* Starts with * or + */
 242 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256
 257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 262
 263 #define REQUIRE_UTF8    STMT_START {                                       \
 264                                      if (!UTF) {                           \
 265                                          *flagp = RESTART_UTF8;            \
 266                                          return NULL;                      \
 267                                      }                                     \
 268                         } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290     /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298      /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305     This is the position the string must appear at, or not before.
 306     It also implicitly (when combined with minlenp) tells us how many
 307     characters must match before the string we are searching for.
 308     Likewise when combined with minlenp and the length of the string it
 309     tells us how many characters must appear after the string we have
 310     found.
 311
 312   - max_offset
 313     Only used for floating strings. This is the rightmost point that
 314     the string can appear at. If set to SSize_t_MAX it indicates that the
 315     string can occur infinitely far to the right.
 316
 317   - minlenp
 318     A pointer to the minimum number of characters of the pattern that the
 319     string was found inside. This is important as in the case of positive
 320     lookahead or positive lookbehind we can have multiple patterns
 321     involved. Consider
 322
 323     /(?=FOO).*F/
 324
 325     The minimum length of the pattern overall is 3, the minimum length
 326     of the lookahead part is 3, but the minimum length of the part that
 327     will actually match is 1. So 'FOO's minimum length is 3, but the
 328     minimum length for the F is 1. This is important as the minimum length
 329     is used to determine offsets in front of and behind the string being
 330     looked for.  Since strings can be composites this is the length of the
 331     pattern at the time it was committed with a scan_commit. Note that
 332     the length is calculated by study_chunk, so that the minimum lengths
 333     are not known until the full pattern has been compiled, thus the
 334     pointer to the value.
 335
 336   - lookbehind
 337
 338     In the case of lookbehind the string being searched for can be
 339     offset past the start point of the final matching string.
 340     If this value was just blithely removed from the min_offset it would
 341     invalidate some of the calculations for how many chars must match
 342     before or after (as they are derived from min_offset and minlen and
 343     the length of the string being searched for).
 344     When the final pattern is compiled and the data is moved from the
 345     scan_data_t structure into the regexp structure the information
 346     about lookbehind is factored in, with the information that would
 347     have been lost precalculated in the end_shift field for the
 348     associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356     /*I32 len_min;      unused */
 357     /*I32 len_delta;    unused */
 358     SSize_t pos_min;
 359     SSize_t pos_delta;
 360     SV *last_found;
 361     SSize_t last_end;       /* min value, <0 unless valid. */
 362     SSize_t last_start_min;
 363     SSize_t last_start_max;
 364     SV **longest;           /* Either &l_fixed, or &l_float. */
 365     SV *longest_fixed;      /* longest fixed string found in pattern */
 366     SSize_t offset_fixed;   /* offset where it starts */
 367     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 369     SV *longest_float;      /* longest floating string found in pattern */
 370     SSize_t offset_float_min; /* earliest point in string it can appear */
 371     SSize_t offset_float_max; /* latest point in string it can appear */
 372     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374     I32 flags;
 375     I32 whilem_c;
 376     SSize_t *last_closep;
 377     regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL          0x0001
 407 #define SF_BEFORE_MEOL          0x0002
 408 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL        (+2)
 412 #define SF_FL_SHIFT_EOL         (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF               0x0040
 420 #define SF_HAS_PAR              0x0080
 421 #define SF_IN_PAR               0x0100
 422 #define SF_HAS_EVAL             0x0200
 423 #define SCF_DO_SUBSTR           0x0400
 424 #define SCF_DO_STCLASS_AND      0x0800
 425 #define SCF_DO_STCLASS_OR       0x1000
 426 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS  0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438                                                      == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441                                                      >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS          -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE             0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482                 UTF8fARG(UTF, offset, RExC_precomp), \
 483                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 487  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 488  * "...".
 489  */
 490 #define _FAIL(code) STMT_START {                                        \
 491     const char *ellipses = "";                                          \
 492     IV len = RExC_end - RExC_precomp;                                   \
 493                                                                         \
 494     if (!SIZE_ONLY)                                                     \
 495         SAVEFREESV(RExC_rx_sv);                                         \
 496     if (len > RegexLengthToShowInErrorMessages) {                       \
 497         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 498         len = RegexLengthToShowInErrorMessages - 10;                    \
 499         ellipses = "...";                                               \
 500     }                                                                   \
 501     code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(                            \
 505     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 506             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(                       \
 509     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 510             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {                                    \
 516     const IV offset = RExC_parse - RExC_precomp;                        \
 517     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 518             m, REPORT_LOCATION_ARGS(offset));   \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {                           \
 525     if (!SIZE_ONLY)                                     \
 526         SAVEFREESV(RExC_rx_sv);                         \
 527     Simple_vFAIL(m);                                    \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 534     const IV offset = RExC_parse - RExC_precomp;                        \
 535     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 536                       REPORT_LOCATION_ARGS(offset));    \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {                       \
 543     if (!SIZE_ONLY)                                     \
 544         SAVEFREESV(RExC_rx_sv);                         \
 545     Simple_vFAIL2(m, a1);                               \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 553     const IV offset = RExC_parse - RExC_precomp;                \
 554     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 555             REPORT_LOCATION_ARGS(offset));      \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {                    \
 562     if (!SIZE_ONLY)                                     \
 563         SAVEFREESV(RExC_rx_sv);                         \
 564     Simple_vFAIL3(m, a1, a2);                           \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 571     const IV offset = RExC_parse - RExC_precomp;                \
 572     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 573             REPORT_LOCATION_ARGS(offset));      \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 577     if (!SIZE_ONLY)                                     \
 578         SAVEFREESV(RExC_rx_sv);                         \
 579     Simple_vFAIL4(m, a1, a2, a3);                       \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584     const IV offset = RExC_parse - RExC_precomp;   \
 585     if (!SIZE_ONLY)                                \
 586         SAVEFREESV(RExC_rx_sv);                    \
 587     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588             REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594     const IV offset = loc - RExC_precomp;                               \
 595     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596             m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {                                   \
 600     const IV offset = loc - RExC_precomp;                               \
 601     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 602             REPORT_LOCATION_ARGS(offset));              \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {                                  \
 606     const IV offset = loc - RExC_precomp;                               \
 607     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 608             REPORT_LOCATION_ARGS(offset));              \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {                                   \
 612     const IV offset = loc - RExC_precomp;                               \
 613     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 614             m REPORT_LOCATION,                                          \
 615             REPORT_LOCATION_ARGS(offset));              \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {                                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 621             m REPORT_LOCATION,                                          \
 622             REPORT_LOCATION_ARGS(offset));              \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 626     const IV offset = loc - RExC_precomp;                               \
 627     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 628             m REPORT_LOCATION,                                          \
 629             a1, REPORT_LOCATION_ARGS(offset));  \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 635             a1, REPORT_LOCATION_ARGS(offset));  \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 639     const IV offset = loc - RExC_precomp;                               \
 640     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 641             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 645     const IV offset = loc - RExC_precomp;                               \
 646     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 647             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 651     const IV offset = loc - RExC_precomp;                               \
 652     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 653             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 657     const IV offset = loc - RExC_precomp;                               \
 658     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 659             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 663     const IV offset = loc - RExC_precomp;                               \
 664     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 665             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {                  \
 671     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 696     if (! SIZE_ONLY) {                                                  \
 697         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 698                     __LINE__, (int)(node), (int)(byte)));               \
 699         if((node) < 0) {                                                \
 700             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701                                          (int)(node));                  \
 702         } else {                                                        \
 703             RExC_offsets[2*(node)-1] = (byte);                          \
 704         }                                                               \
 705     }                                                                   \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 713     if (! SIZE_ONLY) {                                                  \
 714         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 715                 __LINE__, (int)(node), (int)(len)));                    \
 716         if((node) < 0) {                                                \
 717             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718                                          (int)(node));                  \
 719         } else {                                                        \
 720             RExC_offsets[2*(node)] = (len);                             \
 721         }                                                               \
 722     }                                                                   \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728     Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 735     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 736     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 737 } STMT_END
 738 #endif
 739
 740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 741 #define EXPERIMENTAL_INPLACESCAN
 742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745         DEBUG_OPTIMISE_MORE_r({                                             \
 746             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                                                                             \
 748             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                                                                             \
 751             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                                                                             \
 754             if (RExC_seen & REG_GPOS_SEEN)                                  \
 755                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                                                                             \
 757             if (RExC_seen & REG_CANY_SEEN)                                  \
 758                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                                                                             \
 760             if (RExC_seen & REG_RECURSE_SEEN)                               \
 761                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                                                                             \
 763             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                                                                             \
 766             if (RExC_seen & REG_VERBARG_SEEN)                               \
 767                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                                                                             \
 769             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                                                                             \
 772             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                                                                             \
 775             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                                                                             \
 778             if (RExC_seen & REG_GOSTART_SEEN)                               \
 779                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                                                                             \
 781             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                                                                             \
 784             PerlIO_printf(Perl_debug_log,"\n");                             \
 785         });
 786
 787 #define DEBUG_STUDYDATA(str,data,depth)                              \
 788 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 789     PerlIO_printf(Perl_debug_log,                                    \
 790         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 791         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 792         (int)(depth)*2, "",                                          \
 793         (IV)((data)->pos_min),                                       \
 794         (IV)((data)->pos_delta),                                     \
 795         (UV)((data)->flags),                                         \
 796         (IV)((data)->whilem_c),                                      \
 797         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 798         is_inf ? "INF " : ""                                         \
 799     );                                                               \
 800     if ((data)->last_found)                                          \
 801         PerlIO_printf(Perl_debug_log,                                \
 802             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 803             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 804             SvPVX_const((data)->last_found),                         \
 805             (IV)((data)->last_end),                                  \
 806             (IV)((data)->last_start_min),                            \
 807             (IV)((data)->last_start_max),                            \
 808             ((data)->longest &&                                      \
 809              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 810             SvPVX_const((data)->longest_fixed),                      \
 811             (IV)((data)->offset_fixed),                              \
 812             ((data)->longest &&                                      \
 813              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 814             SvPVX_const((data)->longest_float),                      \
 815             (IV)((data)->offset_float_min),                          \
 816             (IV)((data)->offset_float_max)                           \
 817         );                                                           \
 818     PerlIO_printf(Perl_debug_log,"\n");                              \
 819 });
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827                     SSize_t *minlenp, int is_inf)
 828 {
 829     const STRLEN l = CHR_SVLEN(data->last_found);
 830     const STRLEN old_l = CHR_SVLEN(*data->longest);
 831     GET_RE_DEBUG_FLAGS_DECL;
 832
 833     PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836         SvSetMagicSV(*data->longest, data->last_found);
 837         if (*data->longest == data->longest_fixed) {
 838             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839             if (data->flags & SF_BEFORE_EOL)
 840                 data->flags
 841                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842             else
 843                 data->flags &= ~SF_FIX_BEFORE_EOL;
 844             data->minlen_fixed=minlenp;
 845             data->lookbehind_fixed=0;
 846         }
 847         else { /* *data->longest == data->longest_float */
 848             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849             data->offset_float_max = (l
 850                                       ? data->last_start_max
 851                                       : (data->pos_delta == SSize_t_MAX
 852                                          ? SSize_t_MAX
 853                                          : data->pos_min + data->pos_delta));
 854             if (is_inf
 855                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856                 data->offset_float_max = SSize_t_MAX;
 857             if (data->flags & SF_BEFORE_EOL)
 858                 data->flags
 859                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860             else
 861                 data->flags &= ~SF_FL_BEFORE_EOL;
 862             data->minlen_float=minlenp;
 863             data->lookbehind_float=0;
 864         }
 865     }
 866     SvCUR_set(data->last_found, 0);
 867     {
 868         SV * const sv = data->last_found;
 869         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871             if (mg)
 872                 mg->mg_len = 0;
 873         }
 874     }
 875     data->last_end = -1;
 876     data->flags &= ~SF_BEFORE_EOL;
 877     DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886     /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888     PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890     assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(const regnode_ssc *ssc)
 899 {
 900     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903      * in any way, so there's no point in using it */
 904
 905     UV start, end;
 906     bool ret;
 907
 908     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910     assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913         return FALSE;
 914     }
 915
 916     /* See if the list consists solely of the range 0 - Infinity */
 917     invlist_iterinit(ssc->invlist);
 918     ret = invlist_iternext(ssc->invlist, &start, &end)
 919           && start == 0
 920           && end == UV_MAX;
 921
 922     invlist_iterfinish(ssc->invlist);
 923
 924     if (ret) {
 925         return TRUE;
 926     }
 927
 928     /* If e.g., both \w and \W are set, matches everything */
 929     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930         int i;
 931         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933                 return TRUE;
 934             }
 935         }
 936     }
 937
 938     return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945      * string, any code point, or any posix class under locale */
 946
 947     PERL_ARGS_ASSERT_SSC_INIT;
 948
 949     Zero(ssc, 1, regnode_ssc);
 950     set_ANYOF_SYNTHETIC(ssc);
 951     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952     ssc_anything(ssc);
 953
 954     /* If any portion of the regex is to operate under locale rules,
 955      * initialization includes it.  The reason this isn't done for all regexes
 956      * is that the optimizer was written under the assumption that locale was
 957      * all-or-nothing.  Given the complexity and lack of documentation in the
 958      * optimizer, and that there are inadequate test cases for locale, many
 959      * parts of it may not work properly, it is safest to avoid locale unless
 960      * necessary. */
 961     if (RExC_contains_locale) {
 962         ANYOF_POSIXL_SETALL(ssc);
 963     }
 964     else {
 965         ANYOF_POSIXL_ZERO(ssc);
 966     }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 971                         const regnode_ssc *ssc)
 972 {
 973     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974      * to the list of code points matched, and locale posix classes; hence does
 975      * not check its flags) */
 976
 977     UV start, end;
 978     bool ret;
 979
 980     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982     assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984     invlist_iterinit(ssc->invlist);
 985     ret = invlist_iternext(ssc->invlist, &start, &end)
 986           && start == 0
 987           && end == UV_MAX;
 988
 989     invlist_iterfinish(ssc->invlist);
 990
 991     if (! ret) {
 992         return FALSE;
 993     }
 994
 995     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996         return FALSE;
 997     }
 998
 999     return TRUE;
1000 }
1001
1002 STATIC SV*
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004                                const regnode_charclass* const node)
1005 {
1006     /* Returns a mortal inversion list defining which code points are matched
1007      * by 'node', which is of type ANYOF.  Handles complementing the result if
1008      * appropriate.  If some code points aren't knowable at this time, the
1009      * returned list must, and will, contain every code point that is a
1010      * possibility. */
1011
1012     SV* invlist = sv_2mortal(_new_invlist(0));
1013     SV* only_utf8_locale_invlist = NULL;
1014     unsigned int i;
1015     const U32 n = ARG(node);
1016     bool new_node_has_latin1 = FALSE;
1017
1018     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1019
1020     /* Look at the data structure created by S_set_ANYOF_arg() */
1021     if (n != ANYOF_NONBITMAP_EMPTY) {
1022         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023         AV * const av = MUTABLE_AV(SvRV(rv));
1024         SV **const ary = AvARRAY(av);
1025         assert(RExC_rxi->data->what[n] == 's');
1026
1027         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1029         }
1030         else if (ary[0] && ary[0] != &PL_sv_undef) {
1031
1032             /* Here, no compile-time swash, and there are things that won't be
1033              * known until runtime -- we have to assume it could be anything */
1034             return _add_range_to_invlist(invlist, 0, UV_MAX);
1035         }
1036         else if (ary[3] && ary[3] != &PL_sv_undef) {
1037
1038             /* Here no compile-time swash, and no run-time only data.  Use the
1039              * node's inversion list */
1040             invlist = sv_2mortal(invlist_clone(ary[3]));
1041         }
1042
1043         /* Get the code points valid only under UTF-8 locales */
1044         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045             && ary[2] && ary[2] != &PL_sv_undef)
1046         {
1047             only_utf8_locale_invlist = ary[2];
1048         }
1049     }
1050
1051     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052      * inversion list for the others, but if there are code points that should
1053      * match only conditionally on the target string being UTF-8, those are
1054      * placed in the inversion list, and not the bitmap.  Since there are
1055      * circumstances under which they could match, they are included in the
1056      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1057      * here, so that when we invert below, the end result actually does include
1058      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1059      * before we add the unconditionally matched code points */
1060     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061         _invlist_intersection_complement_2nd(invlist,
1062                                              PL_UpperLatin1,
1063                                              &invlist);
1064     }
1065
1066     /* Add in the points from the bit map */
1067     for (i = 0; i < 256; i++) {
1068         if (ANYOF_BITMAP_TEST(node, i)) {
1069             invlist = add_cp_to_invlist(invlist, i);
1070             new_node_has_latin1 = TRUE;
1071         }
1072     }
1073
1074     /* If this can match all upper Latin1 code points, have to add them
1075      * as well */
1076     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078     }
1079
1080     /* Similarly for these */
1081     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083     }
1084
1085     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086         _invlist_invert(invlist);
1087     }
1088     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1089
1090         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091          * locale.  We can skip this if there are no 0-255 at all. */
1092         _invlist_union(invlist, PL_Latin1, &invlist);
1093     }
1094
1095     /* Similarly add the UTF-8 locale possible matches.  These have to be
1096      * deferred until after the non-UTF-8 locale ones are taken care of just
1097      * above, or it leads to wrong results under ANYOF_INVERT */
1098     if (only_utf8_locale_invlist) {
1099         _invlist_union_maybe_complement_2nd(invlist,
1100                                             only_utf8_locale_invlist,
1101                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1102                                             &invlist);
1103     }
1104
1105     return invlist;
1106 }
1107
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero           ssc_init
1110
1111 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
1114 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1115  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1116  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117
1118 STATIC void
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120                 const regnode_charclass *and_with)
1121 {
1122     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123      * another SSC or a regular ANYOF class.  Can create false positives. */
1124
1125     SV* anded_cp_list;
1126     U8  anded_flags;
1127
1128     PERL_ARGS_ASSERT_SSC_AND;
1129
1130     assert(is_ANYOF_SYNTHETIC(ssc));
1131
1132     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133      * the code point inversion list and just the relevant flags */
1134     if (is_ANYOF_SYNTHETIC(and_with)) {
1135         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136         anded_flags = ANYOF_FLAGS(and_with);
1137
1138         /* XXX This is a kludge around what appears to be deficiencies in the
1139          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1140          * there are paths through the optimizer where it doesn't get weeded
1141          * out when it should.  And if we don't make some extra provision for
1142          * it like the code just below, it doesn't get added when it should.
1143          * This solution is to add it only when AND'ing, which is here, and
1144          * only when what is being AND'ed is the pristine, original node
1145          * matching anything.  Thus it is like adding it to ssc_anything() but
1146          * only when the result is to be AND'ed.  Probably the same solution
1147          * could be adopted for the same problem we have with /l matching,
1148          * which is solved differently in S_ssc_init(), and that would lead to
1149          * fewer false positives than that solution has.  But if this solution
1150          * creates bugs, the consequences are only that a warning isn't raised
1151          * that should be; while the consequences for having /l bugs is
1152          * incorrect matches */
1153         if (ssc_is_anything((regnode_ssc *)and_with)) {
1154             anded_flags |= ANYOF_WARN_SUPER;
1155         }
1156     }
1157     else {
1158         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160     }
1161
1162     ANYOF_FLAGS(ssc) &= anded_flags;
1163
1164     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166      * 'and_with' may be inverted.  When not inverted, we have the situation of
1167      * computing:
1168      *  (C1 | P1) & (C2 | P2)
1169      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1172      *                    <=  ((C1 & C2) | P1 | P2)
1173      * Alternatively, the last few steps could be:
1174      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1176      *                    <=  (C1 | C2 | (P1 & P2))
1177      * We favor the second approach if either P1 or P2 is non-empty.  This is
1178      * because these components are a barrier to doing optimizations, as what
1179      * they match cannot be known until the moment of matching as they are
1180      * dependent on the current locale, 'AND"ing them likely will reduce or
1181      * eliminate them.
1182      * But we can do better if we know that C1,P1 are in their initial state (a
1183      * frequent occurrence), each matching everything:
1184      *  (<everything>) & (C2 | P2) =  C2 | P2
1185      * Similarly, if C2,P2 are in their initial state (again a frequent
1186      * occurrence), the result is a no-op
1187      *  (C1 | P1) & (<everything>) =  C1 | P1
1188      *
1189      * Inverted, we have
1190      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1191      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1193      * */
1194
1195     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196         && ! is_ANYOF_SYNTHETIC(and_with))
1197     {
1198         unsigned int i;
1199
1200         ssc_intersection(ssc,
1201                          anded_cp_list,
1202                          FALSE /* Has already been inverted */
1203                          );
1204
1205         /* If either P1 or P2 is empty, the intersection will be also; can skip
1206          * the loop */
1207         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208             ANYOF_POSIXL_ZERO(ssc);
1209         }
1210         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1211
1212             /* Note that the Posix class component P from 'and_with' actually
1213              * looks like:
1214              *      P = Pa | Pb | ... | Pn
1215              * where each component is one posix class, such as in [\w\s].
1216              * Thus
1217              *      ~P = ~(Pa | Pb | ... | Pn)
1218              *         = ~Pa & ~Pb & ... & ~Pn
1219              *        <= ~Pa | ~Pb | ... | ~Pn
1220              * The last is something we can easily calculate, but unfortunately
1221              * is likely to have many false positives.  We could do better
1222              * in some (but certainly not all) instances if two classes in
1223              * P have known relationships.  For example
1224              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1225              * So
1226              *      :lower: & :print: = :lower:
1227              * And similarly for classes that must be disjoint.  For example,
1228              * since \s and \w can have no elements in common based on rules in
1229              * the POSIX standard,
1230              *      \w & ^\S = nothing
1231              * Unfortunately, some vendor locales do not meet the Posix
1232              * standard, in particular almost everything by Microsoft.
1233              * The loop below just changes e.g., \w into \W and vice versa */
1234
1235             regnode_charclass_posixl temp;
1236             int add = 1;    /* To calculate the index of the complement */
1237
1238             ANYOF_POSIXL_ZERO(&temp);
1239             for (i = 0; i < ANYOF_MAX; i++) {
1240                 assert(i % 2 != 0
1241                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1243
1244                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245                     ANYOF_POSIXL_SET(&temp, i + add);
1246                 }
1247                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1248             }
1249             ANYOF_POSIXL_AND(&temp, ssc);
1250
1251         } /* else ssc already has no posixes */
1252     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1253          in its initial state */
1254     else if (! is_ANYOF_SYNTHETIC(and_with)
1255              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1256     {
1257         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258          * copy it over 'ssc' */
1259         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260             if (is_ANYOF_SYNTHETIC(and_with)) {
1261                 StructCopy(and_with, ssc, regnode_ssc);
1262             }
1263             else {
1264                 ssc->invlist = anded_cp_list;
1265                 ANYOF_POSIXL_ZERO(ssc);
1266                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1268                 }
1269             }
1270         }
1271         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1273         {
1274             /* One or the other of P1, P2 is non-empty. */
1275             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1277             }
1278             ssc_union(ssc, anded_cp_list, FALSE);
1279         }
1280         else { /* P1 = P2 = empty */
1281             ssc_intersection(ssc, anded_cp_list, FALSE);
1282         }
1283     }
1284 }
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288                const regnode_charclass *or_with)
1289 {
1290     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291      * another SSC or a regular ANYOF class.  Can create false positives if
1292      * 'or_with' is to be inverted. */
1293
1294     SV* ored_cp_list;
1295     U8 ored_flags;
1296
1297     PERL_ARGS_ASSERT_SSC_OR;
1298
1299     assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302      * the code point inversion list and just the relevant flags */
1303     if (is_ANYOF_SYNTHETIC(or_with)) {
1304         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305         ored_flags = ANYOF_FLAGS(or_with);
1306     }
1307     else {
1308         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310     }
1311
1312     ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316      * 'or_with' may be inverted.  When not inverted, we have the simple
1317      * situation of computing:
1318      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319      * If P1|P2 yields a situation with both a class and its complement are
1320      * set, like having both \w and \W, this matches all code points, and we
1321      * can delete these from the P component of the ssc going forward.  XXX We
1322      * might be able to delete all the P components, but I (khw) am not certain
1323      * about this, and it is better to be safe.
1324      *
1325      * Inverted, we have
1326      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327      *                         <=  (C1 | P1) | ~C2
1328      *                         <=  (C1 | ~C2) | P1
1329      * (which results in actually simpler code than the non-inverted case)
1330      * */
1331
1332     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333         && ! is_ANYOF_SYNTHETIC(or_with))
1334     {
1335         /* We ignore P2, leaving P1 going forward */
1336     }   /* else  Not inverted */
1337     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340             unsigned int i;
1341             for (i = 0; i < ANYOF_MAX; i += 2) {
1342                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343                 {
1344                     ssc_match_all_cp(ssc);
1345                     ANYOF_POSIXL_CLEAR(ssc, i);
1346                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1347                 }
1348             }
1349         }
1350     }
1351
1352     ssc_union(ssc,
1353               ored_cp_list,
1354               FALSE /* Already has been inverted */
1355               );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
1361     PERL_ARGS_ASSERT_SSC_UNION;
1362
1363     assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365     _invlist_union_maybe_complement_2nd(ssc->invlist,
1366                                         invlist,
1367                                         invert2nd,
1368                                         &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373                          SV* const invlist,
1374                          const bool invert2nd)
1375 {
1376     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378     assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381                                                invlist,
1382                                                invert2nd,
1383                                                &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
1389     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391     assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399     /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
1401     SV* cp_list = _new_invlist(2);
1402
1403     PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405     assert(is_ANYOF_SYNTHETIC(ssc));
1406
1407     cp_list = add_cp_to_invlist(cp_list, cp);
1408     ssc_intersection(ssc, cp_list,
1409                      FALSE /* Not inverted */
1410                      );
1411     SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1416 {
1417     /* Set the SSC 'ssc' to not match any locale things */
1418     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419
1420     assert(is_ANYOF_SYNTHETIC(ssc));
1421
1422     ANYOF_POSIXL_ZERO(ssc);
1423     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1424 }
1425
1426 STATIC void
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 {
1429     /* The inversion list in the SSC is marked mortal; now we need a more
1430      * permanent copy, which is stored the same way that is done in a regular
1431      * ANYOF node, with the first 256 code points in a bit map */
1432
1433     SV* invlist = invlist_clone(ssc->invlist);
1434
1435     PERL_ARGS_ASSERT_SSC_FINALIZE;
1436
1437     assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439     /* The code in this file assumes that all but these flags aren't relevant
1440      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441      * time we reach here */
1442     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443
1444     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445
1446     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447                                 NULL, NULL, NULL, FALSE);
1448
1449     /* Make sure is clone-safe */
1450     ssc->invlist = NULL;
1451
1452     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1454     }
1455
1456     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1457 }
1458
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1463                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1464                                : 0 )
1465
1466
1467 #ifdef DEBUGGING
1468 /*
1469    dump_trie(trie,widecharmap,revcharmap)
1470    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472
1473    These routines dump out a trie in a somewhat readable format.
1474    The _interim_ variants are used for debugging the interim
1475    tables that are used to generate the final compressed
1476    representation which is what dump_trie expects.
1477
1478    Part of the reason for their existence is to provide a form
1479    of documentation as to how the different representations function.
1480
1481 */
1482
1483 /*
1484   Dumps the final compressed table form of the trie to Perl_debug_log.
1485   Used for debugging make_trie().
1486 */
1487
1488 STATIC void
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490             AV *revcharmap, U32 depth)
1491 {
1492     U32 state;
1493     SV *sv=sv_newmortal();
1494     int colwidth= widecharmap ? 6 : 4;
1495     U16 word;
1496     GET_RE_DEBUG_FLAGS_DECL;
1497
1498     PERL_ARGS_ASSERT_DUMP_TRIE;
1499
1500     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501         (int)depth * 2 + 2,"",
1502         "Match","Base","Ofs" );
1503
1504     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505         SV ** const tmp = av_fetch( revcharmap, state, 0);
1506         if ( tmp ) {
1507             PerlIO_printf( Perl_debug_log, "%*s",
1508                 colwidth,
1509                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510                             PL_colors[0], PL_colors[1],
1511                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512                             PERL_PV_ESCAPE_FIRSTCHAR
1513                 )
1514             );
1515         }
1516     }
1517     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518         (int)depth * 2 + 2,"");
1519
1520     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522     PerlIO_printf( Perl_debug_log, "\n");
1523
1524     for( state = 1 ; state < trie->statecount ; state++ ) {
1525         const U32 base = trie->states[ state ].trans.base;
1526
1527         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528                                        (int)depth * 2 + 2,"", (UV)state);
1529
1530         if ( trie->states[ state ].wordnum ) {
1531             PerlIO_printf( Perl_debug_log, " W%4X",
1532                                            trie->states[ state ].wordnum );
1533         } else {
1534             PerlIO_printf( Perl_debug_log, "%6s", "" );
1535         }
1536
1537         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1538
1539         if ( base ) {
1540             U32 ofs = 0;
1541
1542             while( ( base + ofs  < trie->uniquecharcount ) ||
1543                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1545                                                                     != state))
1546                     ofs++;
1547
1548             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549
1550             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551                 if ( ( base + ofs >= trie->uniquecharcount )
1552                         && ( base + ofs - trie->uniquecharcount
1553                                                         < trie->lasttrans )
1554                         && trie->trans[ base + ofs
1555                                     - trie->uniquecharcount ].check == state )
1556                 {
1557                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558                     colwidth,
1559                     (UV)trie->trans[ base + ofs
1560                                              - trie->uniquecharcount ].next );
1561                 } else {
1562                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1563                 }
1564             }
1565
1566             PerlIO_printf( Perl_debug_log, "]");
1567
1568         }
1569         PerlIO_printf( Perl_debug_log, "\n" );
1570     }
1571     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572                                 (int)depth*2, "");
1573     for (word=1; word <= trie->wordcount; word++) {
1574         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575             (int)word, (int)(trie->wordinfo[word].prev),
1576             (int)(trie->wordinfo[word].len));
1577     }
1578     PerlIO_printf(Perl_debug_log, "\n" );
1579 }
1580 /*
1581   Dumps a fully constructed but uncompressed trie in list form.
1582   List tries normally only are used for construction when the number of
1583   possible chars (trie->uniquecharcount) is very high.
1584   Used for debugging make_trie().
1585 */
1586 STATIC void
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1589                          U32 depth)
1590 {
1591     U32 state;
1592     SV *sv=sv_newmortal();
1593     int colwidth= widecharmap ? 6 : 4;
1594     GET_RE_DEBUG_FLAGS_DECL;
1595
1596     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597
1598     /* print out the table precompression.  */
1599     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601         "------:-----+-----------------\n" );
1602
1603     for( state=1 ; state < next_alloc ; state ++ ) {
1604         U16 charid;
1605
1606         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607             (int)depth * 2 + 2,"", (UV)state  );
1608         if ( ! trie->states[ state ].wordnum ) {
1609             PerlIO_printf( Perl_debug_log, "%5s| ","");
1610         } else {
1611             PerlIO_printf( Perl_debug_log, "W%4x| ",
1612                 trie->states[ state ].wordnum
1613             );
1614         }
1615         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616             SV ** const tmp = av_fetch( revcharmap,
1617                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1618             if ( tmp ) {
1619                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620                     colwidth,
1621                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622                               colwidth,
1623                               PL_colors[0], PL_colors[1],
1624                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625                               | PERL_PV_ESCAPE_FIRSTCHAR
1626                     ) ,
1627                     TRIE_LIST_ITEM(state,charid).forid,
1628                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1629                 );
1630                 if (!(charid % 10))
1631                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632                         (int)((depth * 2) + 14), "");
1633             }
1634         }
1635         PerlIO_printf( Perl_debug_log, "\n");
1636     }
1637 }
1638
1639 /*
1640   Dumps a fully constructed but uncompressed trie in table form.
1641   This is the normal DFA style state transition table, with a few
1642   twists to facilitate compression later.
1643   Used for debugging make_trie().
1644 */
1645 STATIC void
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1648                           U32 depth)
1649 {
1650     U32 state;
1651     U16 charid;
1652     SV *sv=sv_newmortal();
1653     int colwidth= widecharmap ? 6 : 4;
1654     GET_RE_DEBUG_FLAGS_DECL;
1655
1656     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1657
1658     /*
1659        print out the table precompression so that we can do a visual check
1660        that they are identical.
1661      */
1662
1663     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664
1665     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667         if ( tmp ) {
1668             PerlIO_printf( Perl_debug_log, "%*s",
1669                 colwidth,
1670                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671                             PL_colors[0], PL_colors[1],
1672                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673                             PERL_PV_ESCAPE_FIRSTCHAR
1674                 )
1675             );
1676         }
1677     }
1678
1679     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680
1681     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1683     }
1684
1685     PerlIO_printf( Perl_debug_log, "\n" );
1686
1687     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688
1689         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690             (int)depth * 2 + 2,"",
1691             (UV)TRIE_NODENUM( state ) );
1692
1693         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695             if (v)
1696                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697             else
1698                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699         }
1700         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702                                             (UV)trie->trans[ state ].check );
1703         } else {
1704             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705                                             (UV)trie->trans[ state ].check,
1706             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1707         }
1708     }
1709 }
1710
1711 #endif
1712
1713
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715   startbranch: the first branch in the whole branch sequence
1716   first      : start branch of sequence of branch-exact nodes.
1717                May be the same as startbranch
1718   last       : Thing following the last branch.
1719                May be the same as tail.
1720   tail       : item following the branch sequence
1721   count      : words in the sequence
1722   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723   depth      : indent depth
1724
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1731
1732   /he|she|his|hers/
1733
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1738
1739       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1740       |    |
1741       |   (2)
1742       |    |
1743      (1)   +-i->(6)-+-s->[7]
1744       |
1745       +-s->(3)-+-h->(4)-+-e->[5]
1746
1747       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1756
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1759
1760  / (DUPE|DUPE) X? (?{ ... }) Y /x
1761
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1766
1767  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768
1769 which prints out 'word' three times, but
1770
1771  'words'=~/(word|word|word)(?{ print $1 })S/
1772
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1774
1775 Example of what happens on a structural level:
1776
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778
1779    1: CURLYM[1] {1,32767}(18)
1780    5:   BRANCH(8)
1781    6:     EXACT <ac>(16)
1782    8:   BRANCH(11)
1783    9:     EXACT <ad>(16)
1784   11:   BRANCH(14)
1785   12:     EXACT <ab>(16)
1786   16:   SUCCEED(0)
1787   17:   NOTHING(18)
1788   18: END(0)
1789
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1792
1793    1: CURLYM[1] {1,32767}(18)
1794    5:   TRIE(16)
1795         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1796           <ac>
1797           <ad>
1798           <ab>
1799   16:   SUCCEED(0)
1800   17:   NOTHING(18)
1801   18: END(0)
1802
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1804
1805    1: BRANCH(4)
1806    2:   EXACT <foo>(8)
1807    4: BRANCH(7)
1808    5:   EXACT <bar>(8)
1809    7: TAIL(8)
1810    8: EXACT <baz>(10)
1811   10: END(0)
1812
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1815
1816     1: TRIE(8)
1817       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1818         <foo>
1819         <bar>
1820    7: TAIL(8)
1821    8: EXACT <baz>(10)
1822   10: END(0)
1823
1824     d = uvchr_to_utf8_flags(d, uv, 0);
1825
1826 is the recommended Unicode-aware way of saying
1827
1828     *(d++) = uv;
1829 */
1830
1831 #define TRIE_STORE_REVCHAR(val)                                            \
1832     STMT_START {                                                           \
1833         if (UTF) {                                                         \
1834             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1835             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1836             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1838             SvPOK_on(zlopp);                                               \
1839             SvUTF8_on(zlopp);                                              \
1840             av_push(revcharmap, zlopp);                                    \
1841         } else {                                                           \
1842             char ooooff = (char)val;                                           \
1843             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1844         }                                                                  \
1845         } STMT_END
1846
1847 /* This gets the next character from the input, folding it if not already
1848  * folded. */
1849 #define TRIE_READ_CHAR STMT_START {                                           \
1850     wordlen++;                                                                \
1851     if ( UTF ) {                                                              \
1852         /* if it is UTF then it is either already folded, or does not need    \
1853          * folding */                                                         \
1854         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1855     }                                                                         \
1856     else if (folder == PL_fold_latin1) {                                      \
1857         /* This folder implies Unicode rules, which in the range expressible  \
1858          *  by not UTF is the lower case, with the two exceptions, one of     \
1859          *  which should have been taken care of before calling this */       \
1860         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1861         uvc = toLOWER_L1(*uc);                                                \
1862         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1863         len = 1;                                                              \
1864     } else {                                                                  \
1865         /* raw data, will be folded later if needed */                        \
1866         uvc = (U32)*uc;                                                       \
1867         len = 1;                                                              \
1868     }                                                                         \
1869 } STMT_END
1870
1871
1872
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1874     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1875         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1876         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877     }                                                           \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1879     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1880     TRIE_LIST_CUR( state )++;                                   \
1881 } STMT_END
1882
1883 #define TRIE_LIST_NEW(state) STMT_START {                       \
1884     Newxz( trie->states[ state ].trans.list,               \
1885         4, reg_trie_trans_le );                                 \
1886      TRIE_LIST_CUR( state ) = 1;                                \
1887      TRIE_LIST_LEN( state ) = 4;                                \
1888 } STMT_END
1889
1890 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1891     U16 dupe= trie->states[ state ].wordnum;                    \
1892     regnode * const noper_next = regnext( noper );              \
1893                                                                 \
1894     DEBUG_r({                                                   \
1895         /* store the word for dumping */                        \
1896         SV* tmp;                                                \
1897         if (OP(noper) != NOTHING)                               \
1898             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1899         else                                                    \
1900             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1901         av_push( trie_words, tmp );                             \
1902     });                                                         \
1903                                                                 \
1904     curword++;                                                  \
1905     trie->wordinfo[curword].prev   = 0;                         \
1906     trie->wordinfo[curword].len    = wordlen;                   \
1907     trie->wordinfo[curword].accept = state;                     \
1908                                                                 \
1909     if ( noper_next < tail ) {                                  \
1910         if (!trie->jump)                                        \
1911             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912                                                  sizeof(U16) ); \
1913         trie->jump[curword] = (U16)(noper_next - convert);      \
1914         if (!jumper)                                            \
1915             jumper = noper_next;                                \
1916         if (!nextbranch)                                        \
1917             nextbranch= regnext(cur);                           \
1918     }                                                           \
1919                                                                 \
1920     if ( dupe ) {                                               \
1921         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1922         /* chain, so that when the bits of chain are later    */\
1923         /* linked together, the dups appear in the chain      */\
1924         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925         trie->wordinfo[dupe].prev = curword;                    \
1926     } else {                                                    \
1927         /* we haven't inserted this word yet.                */ \
1928         trie->states[ state ].wordnum = curword;                \
1929     }                                                           \
1930 } STMT_END
1931
1932
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1934      ( ( base + charid >=  ucharcount                                   \
1935          && base + charid < ubound                                      \
1936          && state == trie->trans[ base - ucharcount + charid ].check    \
1937          && trie->trans[ base - ucharcount + charid ].next )            \
1938            ? trie->trans[ base - ucharcount + charid ].next             \
1939            : ( state==1 ? special : 0 )                                 \
1940       )
1941
1942 #define MADE_TRIE       1
1943 #define MADE_JUMP_TRIE  2
1944 #define MADE_EXACT_TRIE 4
1945
1946 STATIC I32
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948                   regnode *first, regnode *last, regnode *tail,
1949                   U32 word_count, U32 flags, U32 depth)
1950 {
1951     dVAR;
1952     /* first pass, loop through and scan words */
1953     reg_trie_data *trie;
1954     HV *widecharmap = NULL;
1955     AV *revcharmap = newAV();
1956     regnode *cur;
1957     STRLEN len = 0;
1958     UV uvc = 0;
1959     U16 curword = 0;
1960     U32 next_alloc = 0;
1961     regnode *jumper = NULL;
1962     regnode *nextbranch = NULL;
1963     regnode *convert = NULL;
1964     U32 *prev_states; /* temp array mapping each state to previous one */
1965     /* we just use folder as a flag in utf8 */
1966     const U8 * folder = NULL;
1967
1968 #ifdef DEBUGGING
1969     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1970     AV *trie_words = NULL;
1971     /* along with revcharmap, this only used during construction but both are
1972      * useful during debugging so we store them in the struct when debugging.
1973      */
1974 #else
1975     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1976     STRLEN trie_charcount=0;
1977 #endif
1978     SV *re_trie_maxbuff;
1979     GET_RE_DEBUG_FLAGS_DECL;
1980
1981     PERL_ARGS_ASSERT_MAKE_TRIE;
1982 #ifndef DEBUGGING
1983     PERL_UNUSED_ARG(depth);
1984 #endif
1985
1986     switch (flags) {
1987         case EXACT: break;
1988         case EXACTFA:
1989         case EXACTFU_SS:
1990         case EXACTFU: folder = PL_fold_latin1; break;
1991         case EXACTF:  folder = PL_fold; break;
1992         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1993     }
1994
1995     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1996     trie->refcount = 1;
1997     trie->startstate = 1;
1998     trie->wordcount = word_count;
1999     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2000     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2001     if (flags == EXACT)
2002         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2003     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2004                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2005
2006     DEBUG_r({
2007         trie_words = newAV();
2008     });
2009
2010     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2011     assert(re_trie_maxbuff);
2012     if (!SvIOK(re_trie_maxbuff)) {
2013         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2014     }
2015     DEBUG_TRIE_COMPILE_r({
2016         PerlIO_printf( Perl_debug_log,
2017           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2018           (int)depth * 2 + 2, "",
2019           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2020           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2021     });
2022
2023    /* Find the node we are going to overwrite */
2024     if ( first == startbranch && OP( last ) != BRANCH ) {
2025         /* whole branch chain */
2026         convert = first;
2027     } else {
2028         /* branch sub-chain */
2029         convert = NEXTOPER( first );
2030     }
2031
2032     /*  -- First loop and Setup --
2033
2034        We first traverse the branches and scan each word to determine if it
2035        contains widechars, and how many unique chars there are, this is
2036        important as we have to build a table with at least as many columns as we
2037        have unique chars.
2038
2039        We use an array of integers to represent the character codes 0..255
2040        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2041        the native representation of the character value as the key and IV's for
2042        the coded index.
2043
2044        *TODO* If we keep track of how many times each character is used we can
2045        remap the columns so that the table compression later on is more
2046        efficient in terms of memory by ensuring the most common value is in the
2047        middle and the least common are on the outside.  IMO this would be better
2048        than a most to least common mapping as theres a decent chance the most
2049        common letter will share a node with the least common, meaning the node
2050        will not be compressible. With a middle is most common approach the worst
2051        case is when we have the least common nodes twice.
2052
2053      */
2054
2055     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2056         regnode *noper = NEXTOPER( cur );
2057         const U8 *uc = (U8*)STRING( noper );
2058         const U8 *e  = uc + STR_LEN( noper );
2059         int foldlen = 0;
2060         U32 wordlen      = 0;         /* required init */
2061         STRLEN minchars = 0;
2062         STRLEN maxchars = 0;
2063         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2064                                                bitmap?*/
2065
2066         if (OP(noper) == NOTHING) {
2067             regnode *noper_next= regnext(noper);
2068             if (noper_next != tail && OP(noper_next) == flags) {
2069                 noper = noper_next;
2070                 uc= (U8*)STRING(noper);
2071                 e= uc + STR_LEN(noper);
2072                 trie->minlen= STR_LEN(noper);
2073             } else {
2074                 trie->minlen= 0;
2075                 continue;
2076             }
2077         }
2078
2079         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2080             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2081                                           regardless of encoding */
2082             if (OP( noper ) == EXACTFU_SS) {
2083                 /* false positives are ok, so just set this */
2084                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2085             }
2086         }
2087         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2088                                            branch */
2089             TRIE_CHARCOUNT(trie)++;
2090             TRIE_READ_CHAR;
2091
2092             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2093              * is in effect.  Under /i, this character can match itself, or
2094              * anything that folds to it.  If not under /i, it can match just
2095              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2096              * all fold to k, and all are single characters.   But some folds
2097              * expand to more than one character, so for example LATIN SMALL
2098              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2099              * the string beginning at 'uc' is 'ffi', it could be matched by
2100              * three characters, or just by the one ligature character. (It
2101              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2102              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2103              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2104              * match.)  The trie needs to know the minimum and maximum number
2105              * of characters that could match so that it can use size alone to
2106              * quickly reject many match attempts.  The max is simple: it is
2107              * the number of folded characters in this branch (since a fold is
2108              * never shorter than what folds to it. */
2109
2110             maxchars++;
2111
2112             /* And the min is equal to the max if not under /i (indicated by
2113              * 'folder' being NULL), or there are no multi-character folds.  If
2114              * there is a multi-character fold, the min is incremented just
2115              * once, for the character that folds to the sequence.  Each
2116              * character in the sequence needs to be added to the list below of
2117              * characters in the trie, but we count only the first towards the
2118              * min number of characters needed.  This is done through the
2119              * variable 'foldlen', which is returned by the macros that look
2120              * for these sequences as the number of bytes the sequence
2121              * occupies.  Each time through the loop, we decrement 'foldlen' by
2122              * how many bytes the current char occupies.  Only when it reaches
2123              * 0 do we increment 'minchars' or look for another multi-character
2124              * sequence. */
2125             if (folder == NULL) {
2126                 minchars++;
2127             }
2128             else if (foldlen > 0) {
2129                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2130             }
2131             else {
2132                 minchars++;
2133
2134                 /* See if *uc is the beginning of a multi-character fold.  If
2135                  * so, we decrement the length remaining to look at, to account
2136                  * for the current character this iteration.  (We can use 'uc'
2137                  * instead of the fold returned by TRIE_READ_CHAR because for
2138                  * non-UTF, the latin1_safe macro is smart enough to account
2139                  * for all the unfolded characters, and because for UTF, the
2140                  * string will already have been folded earlier in the
2141                  * compilation process */
2142                 if (UTF) {
2143                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2144                         foldlen -= UTF8SKIP(uc);
2145                     }
2146                 }
2147                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2148                     foldlen--;
2149                 }
2150             }
2151
2152             /* The current character (and any potential folds) should be added
2153              * to the possible matching characters for this position in this
2154              * branch */
2155             if ( uvc < 256 ) {
2156                 if ( folder ) {
2157                     U8 folded= folder[ (U8) uvc ];
2158                     if ( !trie->charmap[ folded ] ) {
2159                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2160                         TRIE_STORE_REVCHAR( folded );
2161                     }
2162                 }
2163                 if ( !trie->charmap[ uvc ] ) {
2164                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2165                     TRIE_STORE_REVCHAR( uvc );
2166                 }
2167                 if ( set_bit ) {
2168                     /* store the codepoint in the bitmap, and its folded
2169                      * equivalent. */
2170                     TRIE_BITMAP_SET(trie, uvc);
2171
2172                     /* store the folded codepoint */
2173                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2174
2175                     if ( !UTF ) {
2176                         /* store first byte of utf8 representation of
2177                            variant codepoints */
2178                         if (! UVCHR_IS_INVARIANT(uvc)) {
2179                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2180                         }
2181                     }
2182                     set_bit = 0; /* We've done our bit :-) */
2183                 }
2184             } else {
2185
2186                 /* XXX We could come up with the list of code points that fold
2187                  * to this using PL_utf8_foldclosures, except not for
2188                  * multi-char folds, as there may be multiple combinations
2189                  * there that could work, which needs to wait until runtime to
2190                  * resolve (The comment about LIGATURE FFI above is such an
2191                  * example */
2192
2193                 SV** svpp;
2194                 if ( !widecharmap )
2195                     widecharmap = newHV();
2196
2197                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2198
2199                 if ( !svpp )
2200                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2201
2202                 if ( !SvTRUE( *svpp ) ) {
2203                     sv_setiv( *svpp, ++trie->uniquecharcount );
2204                     TRIE_STORE_REVCHAR(uvc);
2205                 }
2206             }
2207         } /* end loop through characters in this branch of the trie */
2208
2209         /* We take the min and max for this branch and combine to find the min
2210          * and max for all branches processed so far */
2211         if( cur == first ) {
2212             trie->minlen = minchars;
2213             trie->maxlen = maxchars;
2214         } else if (minchars < trie->minlen) {
2215             trie->minlen = minchars;
2216         } else if (maxchars > trie->maxlen) {
2217             trie->maxlen = maxchars;
2218         }
2219     } /* end first pass */
2220     DEBUG_TRIE_COMPILE_r(
2221         PerlIO_printf( Perl_debug_log,
2222                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2223                 (int)depth * 2 + 2,"",
2224                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2225                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2226                 (int)trie->minlen, (int)trie->maxlen )
2227     );
2228
2229     /*
2230         We now know what we are dealing with in terms of unique chars and
2231         string sizes so we can calculate how much memory a naive
2232         representation using a flat table  will take. If it's over a reasonable
2233         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2234         conservative but potentially much slower representation using an array
2235         of lists.
2236
2237         At the end we convert both representations into the same compressed
2238         form that will be used in regexec.c for matching with. The latter
2239         is a form that cannot be used to construct with but has memory
2240         properties similar to the list form and access properties similar
2241         to the table form making it both suitable for fast searches and
2242         small enough that its feasable to store for the duration of a program.
2243
2244         See the comment in the code where the compressed table is produced
2245         inplace from the flat tabe representation for an explanation of how
2246         the compression works.
2247
2248     */
2249
2250
2251     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2252     prev_states[1] = 0;
2253
2254     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2255                                                     > SvIV(re_trie_maxbuff) )
2256     {
2257         /*
2258             Second Pass -- Array Of Lists Representation
2259
2260             Each state will be represented by a list of charid:state records
2261             (reg_trie_trans_le) the first such element holds the CUR and LEN
2262             points of the allocated array. (See defines above).
2263
2264             We build the initial structure using the lists, and then convert
2265             it into the compressed table form which allows faster lookups
2266             (but cant be modified once converted).
2267         */
2268
2269         STRLEN transcount = 1;
2270
2271         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2272             "%*sCompiling trie using list compiler\n",
2273             (int)depth * 2 + 2, ""));
2274
2275         trie->states = (reg_trie_state *)
2276             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2277                                   sizeof(reg_trie_state) );
2278         TRIE_LIST_NEW(1);
2279         next_alloc = 2;
2280
2281         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2282
2283             regnode *noper   = NEXTOPER( cur );
2284             U8 *uc           = (U8*)STRING( noper );
2285             const U8 *e      = uc + STR_LEN( noper );
2286             U32 state        = 1;         /* required init */
2287             U16 charid       = 0;         /* sanity init */
2288             U32 wordlen      = 0;         /* required init */
2289
2290             if (OP(noper) == NOTHING) {
2291                 regnode *noper_next= regnext(noper);
2292                 if (noper_next != tail && OP(noper_next) == flags) {
2293                     noper = noper_next;
2294                     uc= (U8*)STRING(noper);
2295                     e= uc + STR_LEN(noper);
2296                 }
2297             }
2298
2299             if (OP(noper) != NOTHING) {
2300                 for ( ; uc < e ; uc += len ) {
2301
2302                     TRIE_READ_CHAR;
2303
2304                     if ( uvc < 256 ) {
2305                         charid = trie->charmap[ uvc ];
2306                     } else {
2307                         SV** const svpp = hv_fetch( widecharmap,
2308                                                     (char*)&uvc,
2309                                                     sizeof( UV ),
2310                                                     0);
2311                         if ( !svpp ) {
2312                             charid = 0;
2313                         } else {
2314                             charid=(U16)SvIV( *svpp );
2315                         }
2316                     }
2317                     /* charid is now 0 if we dont know the char read, or
2318                      * nonzero if we do */
2319                     if ( charid ) {
2320
2321                         U16 check;
2322                         U32 newstate = 0;
2323
2324                         charid--;
2325                         if ( !trie->states[ state ].trans.list ) {
2326                             TRIE_LIST_NEW( state );
2327                         }
2328                         for ( check = 1;
2329                               check <= TRIE_LIST_USED( state );
2330                               check++ )
2331                         {
2332                             if ( TRIE_LIST_ITEM( state, check ).forid
2333                                                                     == charid )
2334                             {
2335                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2336                                 break;
2337                             }
2338                         }
2339                         if ( ! newstate ) {
2340                             newstate = next_alloc++;
2341                             prev_states[newstate] = state;
2342                             TRIE_LIST_PUSH( state, charid, newstate );
2343                             transcount++;
2344                         }
2345                         state = newstate;
2346                     } else {
2347                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2348                     }
2349                 }
2350             }
2351             TRIE_HANDLE_WORD(state);
2352
2353         } /* end second pass */
2354
2355         /* next alloc is the NEXT state to be allocated */
2356         trie->statecount = next_alloc;
2357         trie->states = (reg_trie_state *)
2358             PerlMemShared_realloc( trie->states,
2359                                    next_alloc
2360                                    * sizeof(reg_trie_state) );
2361
2362         /* and now dump it out before we compress it */
2363         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2364                                                          revcharmap, next_alloc,
2365                                                          depth+1)
2366         );
2367
2368         trie->trans = (reg_trie_trans *)
2369             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2370         {
2371             U32 state;
2372             U32 tp = 0;
2373             U32 zp = 0;
2374
2375
2376             for( state=1 ; state < next_alloc ; state ++ ) {
2377                 U32 base=0;
2378
2379                 /*
2380                 DEBUG_TRIE_COMPILE_MORE_r(
2381                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2382                 );
2383                 */
2384
2385                 if (trie->states[state].trans.list) {
2386                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2387                     U16 maxid=minid;
2388                     U16 idx;
2389
2390                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2391                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2392                         if ( forid < minid ) {
2393                             minid=forid;
2394                         } else if ( forid > maxid ) {
2395                             maxid=forid;
2396                         }
2397                     }
2398                     if ( transcount < tp + maxid - minid + 1) {
2399                         transcount *= 2;
2400                         trie->trans = (reg_trie_trans *)
2401                             PerlMemShared_realloc( trie->trans,
2402                                                      transcount
2403                                                      * sizeof(reg_trie_trans) );
2404                         Zero( trie->trans + (transcount / 2),
2405                               transcount / 2,
2406                               reg_trie_trans );
2407                     }
2408                     base = trie->uniquecharcount + tp - minid;
2409                     if ( maxid == minid ) {
2410                         U32 set = 0;
2411                         for ( ; zp < tp ; zp++ ) {
2412                             if ( ! trie->trans[ zp ].next ) {
2413                                 base = trie->uniquecharcount + zp - minid;
2414                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2415                                                                    1).newstate;
2416                                 trie->trans[ zp ].check = state;
2417                                 set = 1;
2418                                 break;
2419                             }
2420                         }
2421                         if ( !set ) {
2422                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2423                                                                    1).newstate;
2424                             trie->trans[ tp ].check = state;
2425                             tp++;
2426                             zp = tp;
2427                         }
2428                     } else {
2429                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2430                             const U32 tid = base
2431                                            - trie->uniquecharcount
2432                                            + TRIE_LIST_ITEM( state, idx ).forid;
2433                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2434                                                                 idx ).newstate;
2435                             trie->trans[ tid ].check = state;
2436                         }
2437                         tp += ( maxid - minid + 1 );
2438                     }
2439                     Safefree(trie->states[ state ].trans.list);
2440                 }
2441                 /*
2442                 DEBUG_TRIE_COMPILE_MORE_r(
2443                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2444                 );
2445                 */
2446                 trie->states[ state ].trans.base=base;
2447             }
2448             trie->lasttrans = tp + 1;
2449         }
2450     } else {
2451         /*
2452            Second Pass -- Flat Table Representation.
2453
2454            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2455            each.  We know that we will need Charcount+1 trans at most to store
2456            the data (one row per char at worst case) So we preallocate both
2457            structures assuming worst case.
2458
2459            We then construct the trie using only the .next slots of the entry
2460            structs.
2461
2462            We use the .check field of the first entry of the node temporarily
2463            to make compression both faster and easier by keeping track of how
2464            many non zero fields are in the node.
2465
2466            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2467            transition.
2468
2469            There are two terms at use here: state as a TRIE_NODEIDX() which is
2470            a number representing the first entry of the node, and state as a
2471            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2472            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2473            if there are 2 entrys per node. eg:
2474
2475              A B       A B
2476           1. 2 4    1. 3 7
2477           2. 0 3    3. 0 5
2478           3. 0 0    5. 0 0
2479           4. 0 0    7. 0 0
2480
2481            The table is internally in the right hand, idx form. However as we
2482            also have to deal with the states array which is indexed by nodenum
2483            we have to use TRIE_NODENUM() to convert.
2484
2485         */
2486         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2487             "%*sCompiling trie using table compiler\n",
2488             (int)depth * 2 + 2, ""));
2489
2490         trie->trans = (reg_trie_trans *)
2491             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2492                                   * trie->uniquecharcount + 1,
2493                                   sizeof(reg_trie_trans) );
2494         trie->states = (reg_trie_state *)
2495             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2496                                   sizeof(reg_trie_state) );
2497         next_alloc = trie->uniquecharcount + 1;
2498
2499
2500         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2501
2502             regnode *noper   = NEXTOPER( cur );
2503             const U8 *uc     = (U8*)STRING( noper );
2504             const U8 *e      = uc + STR_LEN( noper );
2505
2506             U32 state        = 1;         /* required init */
2507
2508             U16 charid       = 0;         /* sanity init */
2509             U32 accept_state = 0;         /* sanity init */
2510
2511             U32 wordlen      = 0;         /* required init */
2512
2513             if (OP(noper) == NOTHING) {
2514                 regnode *noper_next= regnext(noper);
2515                 if (noper_next != tail && OP(noper_next) == flags) {
2516                     noper = noper_next;
2517                     uc= (U8*)STRING(noper);
2518                     e= uc + STR_LEN(noper);
2519                 }
2520             }
2521
2522             if ( OP(noper) != NOTHING ) {
2523                 for ( ; uc < e ; uc += len ) {
2524
2525                     TRIE_READ_CHAR;
2526
2527                     if ( uvc < 256 ) {
2528                         charid = trie->charmap[ uvc ];
2529                     } else {
2530                         SV* const * const svpp = hv_fetch( widecharmap,
2531                                                            (char*)&uvc,
2532                                                            sizeof( UV ),
2533                                                            0);
2534                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2535                     }
2536                     if ( charid ) {
2537                         charid--;
2538                         if ( !trie->trans[ state + charid ].next ) {
2539                             trie->trans[ state + charid ].next = next_alloc;
2540                             trie->trans[ state ].check++;
2541                             prev_states[TRIE_NODENUM(next_alloc)]
2542                                     = TRIE_NODENUM(state);
2543                             next_alloc += trie->uniquecharcount;
2544                         }
2545                         state = trie->trans[ state + charid ].next;
2546                     } else {
2547                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2548                     }
2549                     /* charid is now 0 if we dont know the char read, or
2550                      * nonzero if we do */
2551                 }
2552             }
2553             accept_state = TRIE_NODENUM( state );
2554             TRIE_HANDLE_WORD(accept_state);
2555
2556         } /* end second pass */
2557
2558         /* and now dump it out before we compress it */
2559         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2560                                                           revcharmap,
2561                                                           next_alloc, depth+1));
2562
2563         {
2564         /*
2565            * Inplace compress the table.*
2566
2567            For sparse data sets the table constructed by the trie algorithm will
2568            be mostly 0/FAIL transitions or to put it another way mostly empty.
2569            (Note that leaf nodes will not contain any transitions.)
2570
2571            This algorithm compresses the tables by eliminating most such
2572            transitions, at the cost of a modest bit of extra work during lookup:
2573
2574            - Each states[] entry contains a .base field which indicates the
2575            index in the state[] array wheres its transition data is stored.
2576
2577            - If .base is 0 there are no valid transitions from that node.
2578
2579            - If .base is nonzero then charid is added to it to find an entry in
2580            the trans array.
2581
2582            -If trans[states[state].base+charid].check!=state then the
2583            transition is taken to be a 0/Fail transition. Thus if there are fail
2584            transitions at the front of the node then the .base offset will point
2585            somewhere inside the previous nodes data (or maybe even into a node
2586            even earlier), but the .check field determines if the transition is
2587            valid.
2588
2589            XXX - wrong maybe?
2590            The following process inplace converts the table to the compressed
2591            table: We first do not compress the root node 1,and mark all its
2592            .check pointers as 1 and set its .base pointer as 1 as well. This
2593            allows us to do a DFA construction from the compressed table later,
2594            and ensures that any .base pointers we calculate later are greater
2595            than 0.
2596
2597            - We set 'pos' to indicate the first entry of the second node.
2598
2599            - We then iterate over the columns of the node, finding the first and
2600            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2601            and set the .check pointers accordingly, and advance pos
2602            appropriately and repreat for the next node. Note that when we copy
2603            the next pointers we have to convert them from the original
2604            NODEIDX form to NODENUM form as the former is not valid post
2605            compression.
2606
2607            - If a node has no transitions used we mark its base as 0 and do not
2608            advance the pos pointer.
2609
2610            - If a node only has one transition we use a second pointer into the
2611            structure to fill in allocated fail transitions from other states.
2612            This pointer is independent of the main pointer and scans forward
2613            looking for null transitions that are allocated to a state. When it
2614            finds one it writes the single transition into the "hole".  If the
2615            pointer doesnt find one the single transition is appended as normal.
2616
2617            - Once compressed we can Renew/realloc the structures to release the
2618            excess space.
2619
2620            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2621            specifically Fig 3.47 and the associated pseudocode.
2622
2623            demq
2624         */
2625         const U32 laststate = TRIE_NODENUM( next_alloc );
2626         U32 state, charid;
2627         U32 pos = 0, zp=0;
2628         trie->statecount = laststate;
2629
2630         for ( state = 1 ; state < laststate ; state++ ) {
2631             U8 flag = 0;
2632             const U32 stateidx = TRIE_NODEIDX( state );
2633             const U32 o_used = trie->trans[ stateidx ].check;
2634             U32 used = trie->trans[ stateidx ].check;
2635             trie->trans[ stateidx ].check = 0;
2636
2637             for ( charid = 0;
2638                   used && charid < trie->uniquecharcount;
2639                   charid++ )
2640             {
2641                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2642                     if ( trie->trans[ stateidx + charid ].next ) {
2643                         if (o_used == 1) {
2644                             for ( ; zp < pos ; zp++ ) {
2645                                 if ( ! trie->trans[ zp ].next ) {
2646                                     break;
2647                                 }
2648                             }
2649                             trie->states[ state ].trans.base
2650                                                     = zp
2651                                                       + trie->uniquecharcount
2652                                                       - charid ;
2653                             trie->trans[ zp ].next
2654                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2655                                                              + charid ].next );
2656                             trie->trans[ zp ].check = state;
2657                             if ( ++zp > pos ) pos = zp;
2658                             break;
2659                         }
2660                         used--;
2661                     }
2662                     if ( !flag ) {
2663                         flag = 1;
2664                         trie->states[ state ].trans.base
2665                                        = pos + trie->uniquecharcount - charid ;
2666                     }
2667                     trie->trans[ pos ].next
2668                         = SAFE_TRIE_NODENUM(
2669                                        trie->trans[ stateidx + charid ].next );
2670                     trie->trans[ pos ].check = state;
2671                     pos++;
2672                 }
2673             }
2674         }
2675         trie->lasttrans = pos + 1;
2676         trie->states = (reg_trie_state *)
2677             PerlMemShared_realloc( trie->states, laststate
2678                                    * sizeof(reg_trie_state) );
2679         DEBUG_TRIE_COMPILE_MORE_r(
2680             PerlIO_printf( Perl_debug_log,
2681                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2682                 (int)depth * 2 + 2,"",
2683                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2684                        + 1 ),
2685                 (IV)next_alloc,
2686                 (IV)pos,
2687                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2688             );
2689
2690         } /* end table compress */
2691     }
2692     DEBUG_TRIE_COMPILE_MORE_r(
2693             PerlIO_printf(Perl_debug_log,
2694                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2695                 (int)depth * 2 + 2, "",
2696                 (UV)trie->statecount,
2697                 (UV)trie->lasttrans)
2698     );
2699     /* resize the trans array to remove unused space */
2700     trie->trans = (reg_trie_trans *)
2701         PerlMemShared_realloc( trie->trans, trie->lasttrans
2702                                * sizeof(reg_trie_trans) );
2703
2704     {   /* Modify the program and insert the new TRIE node */
2705         U8 nodetype =(U8)(flags & 0xFF);
2706         char *str=NULL;
2707
2708 #ifdef DEBUGGING
2709         regnode *optimize = NULL;
2710 #ifdef RE_TRACK_PATTERN_OFFSETS
2711
2712         U32 mjd_offset = 0;
2713         U32 mjd_nodelen = 0;
2714 #endif /* RE_TRACK_PATTERN_OFFSETS */
2715 #endif /* DEBUGGING */
2716         /*
2717            This means we convert either the first branch or the first Exact,
2718            depending on whether the thing following (in 'last') is a branch
2719            or not and whther first is the startbranch (ie is it a sub part of
2720            the alternation or is it the whole thing.)
2721            Assuming its a sub part we convert the EXACT otherwise we convert
2722            the whole branch sequence, including the first.
2723          */
2724         /* Find the node we are going to overwrite */
2725         if ( first != startbranch || OP( last ) == BRANCH ) {
2726             /* branch sub-chain */
2727             NEXT_OFF( first ) = (U16)(last - first);
2728 #ifdef RE_TRACK_PATTERN_OFFSETS
2729             DEBUG_r({
2730                 mjd_offset= Node_Offset((convert));
2731                 mjd_nodelen= Node_Length((convert));
2732             });
2733 #endif
2734             /* whole branch chain */
2735         }
2736 #ifdef RE_TRACK_PATTERN_OFFSETS
2737         else {
2738             DEBUG_r({
2739                 const  regnode *nop = NEXTOPER( convert );
2740                 mjd_offset= Node_Offset((nop));
2741                 mjd_nodelen= Node_Length((nop));
2742             });
2743         }
2744         DEBUG_OPTIMISE_r(
2745             PerlIO_printf(Perl_debug_log,
2746                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2747                 (int)depth * 2 + 2, "",
2748                 (UV)mjd_offset, (UV)mjd_nodelen)
2749         );
2750 #endif
2751         /* But first we check to see if there is a common prefix we can
2752            split out as an EXACT and put in front of the TRIE node.  */
2753         trie->startstate= 1;
2754         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2755             U32 state;
2756             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2757                 U32 ofs = 0;
2758                 I32 idx = -1;
2759                 U32 count = 0;
2760                 const U32 base = trie->states[ state ].trans.base;
2761
2762                 if ( trie->states[state].wordnum )
2763                         count = 1;
2764
2765                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2766                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2767                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2768                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2769                     {
2770                         if ( ++count > 1 ) {
2771                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2772                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2773                             if ( state == 1 ) break;
2774                             if ( count == 2 ) {
2775                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2776                                 DEBUG_OPTIMISE_r(
2777                                     PerlIO_printf(Perl_debug_log,
2778                                         "%*sNew Start State=%"UVuf" Class: [",
2779                                         (int)depth * 2 + 2, "",
2780                                         (UV)state));
2781                                 if (idx >= 0) {
2782                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2783                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2784
2785                                     TRIE_BITMAP_SET(trie,*ch);
2786                                     if ( folder )
2787                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2788                                     DEBUG_OPTIMISE_r(
2789                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2790                                     );
2791                                 }
2792                             }
2793                             TRIE_BITMAP_SET(trie,*ch);
2794                             if ( folder )
2795                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2796                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2797                         }
2798                         idx = ofs;
2799                     }
2800                 }
2801                 if ( count == 1 ) {
2802                     SV **tmp = av_fetch( revcharmap, idx, 0);
2803                     STRLEN len;
2804                     char *ch = SvPV( *tmp, len );
2805                     DEBUG_OPTIMISE_r({
2806                         SV *sv=sv_newmortal();
2807                         PerlIO_printf( Perl_debug_log,
2808                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2809                             (int)depth * 2 + 2, "",
2810                             (UV)state, (UV)idx,
2811                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2812                                 PL_colors[0], PL_colors[1],
2813                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2814                                 PERL_PV_ESCAPE_FIRSTCHAR
2815                             )
2816                         );
2817                     });
2818                     if ( state==1 ) {
2819                         OP( convert ) = nodetype;
2820                         str=STRING(convert);
2821                         STR_LEN(convert)=0;
2822                     }
2823                     STR_LEN(convert) += len;
2824                     while (len--)
2825                         *str++ = *ch++;
2826                 } else {
2827 #ifdef DEBUGGING
2828                     if (state>1)
2829                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2830 #endif
2831                     break;
2832                 }
2833             }
2834             trie->prefixlen = (state-1);
2835             if (str) {
2836                 regnode *n = convert+NODE_SZ_STR(convert);
2837                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2838                 trie->startstate = state;
2839                 trie->minlen -= (state - 1);
2840                 trie->maxlen -= (state - 1);
2841 #ifdef DEBUGGING
2842                /* At least the UNICOS C compiler choked on this
2843                 * being argument to DEBUG_r(), so let's just have
2844                 * it right here. */
2845                if (
2846 #ifdef PERL_EXT_RE_BUILD
2847                    1
2848 #else
2849                    DEBUG_r_TEST
2850 #endif
2851                    ) {
2852                    regnode *fix = convert;
2853                    U32 word = trie->wordcount;
2854                    mjd_nodelen++;
2855                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2856                    while( ++fix < n ) {
2857                        Set_Node_Offset_Length(fix, 0, 0);
2858                    }
2859                    while (word--) {
2860                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2861                        if (tmp) {
2862                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2864                            else
2865                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2866                        }
2867                    }
2868                }
2869 #endif
2870                 if (trie->maxlen) {
2871                     convert = n;
2872                 } else {
2873                     NEXT_OFF(convert) = (U16)(tail - convert);
2874                     DEBUG_r(optimize= n);
2875                 }
2876             }
2877         }
2878         if (!jumper)
2879             jumper = last;
2880         if ( trie->maxlen ) {
2881             NEXT_OFF( convert ) = (U16)(tail - convert);
2882             ARG_SET( convert, data_slot );
2883             /* Store the offset to the first unabsorbed branch in
2884                jump[0], which is otherwise unused by the jump logic.
2885                We use this when dumping a trie and during optimisation. */
2886             if (trie->jump)
2887                 trie->jump[0] = (U16)(nextbranch - convert);
2888
2889             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2890              *   and there is a bitmap
2891              *   and the first "jump target" node we found leaves enough room
2892              * then convert the TRIE node into a TRIEC node, with the bitmap
2893              * embedded inline in the opcode - this is hypothetically faster.
2894              */
2895             if ( !trie->states[trie->startstate].wordnum
2896                  && trie->bitmap
2897                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2898             {
2899                 OP( convert ) = TRIEC;
2900                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2901                 PerlMemShared_free(trie->bitmap);
2902                 trie->bitmap= NULL;
2903             } else
2904                 OP( convert ) = TRIE;
2905
2906             /* store the type in the flags */
2907             convert->flags = nodetype;
2908             DEBUG_r({
2909             optimize = convert
2910                       + NODE_STEP_REGNODE
2911                       + regarglen[ OP( convert ) ];
2912             });
2913             /* XXX We really should free up the resource in trie now,
2914                    as we won't use them - (which resources?) dmq */
2915         }
2916         /* needed for dumping*/
2917         DEBUG_r(if (optimize) {
2918             regnode *opt = convert;
2919
2920             while ( ++opt < optimize) {
2921                 Set_Node_Offset_Length(opt,0,0);
2922             }
2923             /*
2924                 Try to clean up some of the debris left after the
2925                 optimisation.
2926              */
2927             while( optimize < jumper ) {
2928                 mjd_nodelen += Node_Length((optimize));
2929                 OP( optimize ) = OPTIMIZED;
2930                 Set_Node_Offset_Length(optimize,0,0);
2931                 optimize++;
2932             }
2933             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2934         });
2935     } /* end node insert */
2936
2937     /*  Finish populating the prev field of the wordinfo array.  Walk back
2938      *  from each accept state until we find another accept state, and if
2939      *  so, point the first word's .prev field at the second word. If the
2940      *  second already has a .prev field set, stop now. This will be the
2941      *  case either if we've already processed that word's accept state,
2942      *  or that state had multiple words, and the overspill words were
2943      *  already linked up earlier.
2944      */
2945     {
2946         U16 word;
2947         U32 state;
2948         U16 prev;
2949
2950         for (word=1; word <= trie->wordcount; word++) {
2951             prev = 0;
2952             if (trie->wordinfo[word].prev)
2953                 continue;
2954             state = trie->wordinfo[word].accept;
2955             while (state) {
2956                 state = prev_states[state];
2957                 if (!state)
2958                     break;
2959                 prev = trie->states[state].wordnum;
2960                 if (prev)
2961                     break;
2962             }
2963             trie->wordinfo[word].prev = prev;
2964         }
2965         Safefree(prev_states);
2966     }
2967
2968
2969     /* and now dump out the compressed format */
2970     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2971
2972     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2973 #ifdef DEBUGGING
2974     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2975     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2976 #else
2977     SvREFCNT_dec_NN(revcharmap);
2978 #endif
2979     return trie->jump
2980            ? MADE_JUMP_TRIE
2981            : trie->startstate>1
2982              ? MADE_EXACT_TRIE
2983              : MADE_TRIE;
2984 }
2985
2986 STATIC regnode *
2987 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2988 {
2989 /* The Trie is constructed and compressed now so we can build a fail array if
2990  * it's needed
2991
2992    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2993    3.32 in the
2994    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2995    Ullman 1985/88
2996    ISBN 0-201-10088-6
2997
2998    We find the fail state for each state in the trie, this state is the longest
2999    proper suffix of the current state's 'word' that is also a proper prefix of
3000    another word in our trie. State 1 represents the word '' and is thus the
3001    default fail state. This allows the DFA not to have to restart after its
3002    tried and failed a word at a given point, it simply continues as though it
3003    had been matching the other word in the first place.
3004    Consider
3005       'abcdgu'=~/abcdefg|cdgu/
3006    When we get to 'd' we are still matching the first word, we would encounter
3007    'g' which would fail, which would bring us to the state representing 'd' in
3008    the second word where we would try 'g' and succeed, proceeding to match
3009    'cdgu'.
3010  */
3011  /* add a fail transition */
3012     const U32 trie_offset = ARG(source);
3013     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3014     U32 *q;
3015     const U32 ucharcount = trie->uniquecharcount;
3016     const U32 numstates = trie->statecount;
3017     const U32 ubound = trie->lasttrans + ucharcount;
3018     U32 q_read = 0;
3019     U32 q_write = 0;
3020     U32 charid;
3021     U32 base = trie->states[ 1 ].trans.base;
3022     U32 *fail;
3023     reg_ac_data *aho;
3024     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3025     regnode *stclass;
3026     GET_RE_DEBUG_FLAGS_DECL;
3027
3028     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3029     PERL_UNUSED_CONTEXT;
3030 #ifndef DEBUGGING
3031     PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034     if ( OP(source) == TRIE ) {
3035         struct regnode_1 *op = (struct regnode_1 *)
3036             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3037         StructCopy(source,op,struct regnode_1);
3038         stclass = (regnode *)op;
3039     } else {
3040         struct regnode_charclass *op = (struct regnode_charclass *)
3041             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3042         StructCopy(source,op,struct regnode_charclass);
3043         stclass = (regnode *)op;
3044     }
3045     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3046
3047     ARG_SET( stclass, data_slot );
3048     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3049     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3050     aho->trie=trie_offset;
3051     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3052     Copy( trie->states, aho->states, numstates, reg_trie_state );
3053     Newxz( q, numstates, U32);
3054     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3055     aho->refcount = 1;
3056     fail = aho->fail;
3057     /* initialize fail[0..1] to be 1 so that we always have
3058        a valid final fail state */
3059     fail[ 0 ] = fail[ 1 ] = 1;
3060
3061     for ( charid = 0; charid < ucharcount ; charid++ ) {
3062         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3063         if ( newstate ) {
3064             q[ q_write ] = newstate;
3065             /* set to point at the root */
3066             fail[ q[ q_write++ ] ]=1;
3067         }
3068     }
3069     while ( q_read < q_write) {
3070         const U32 cur = q[ q_read++ % numstates ];
3071         base = trie->states[ cur ].trans.base;
3072
3073         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3074             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3075             if (ch_state) {
3076                 U32 fail_state = cur;
3077                 U32 fail_base;
3078                 do {
3079                     fail_state = fail[ fail_state ];
3080                     fail_base = aho->states[ fail_state ].trans.base;
3081                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3082
3083                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3084                 fail[ ch_state ] = fail_state;
3085                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3086                 {
3087                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3088                 }
3089                 q[ q_write++ % numstates] = ch_state;
3090             }
3091         }
3092     }
3093     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3094        when we fail in state 1, this allows us to use the
3095        charclass scan to find a valid start char. This is based on the principle
3096        that theres a good chance the string being searched contains lots of stuff
3097        that cant be a start char.
3098      */
3099     fail[ 0 ] = fail[ 1 ] = 0;
3100     DEBUG_TRIE_COMPILE_r({
3101         PerlIO_printf(Perl_debug_log,
3102                       "%*sStclass Failtable (%"UVuf" states): 0",
3103                       (int)(depth * 2), "", (UV)numstates
3104         );
3105         for( q_read=1; q_read<numstates; q_read++ ) {
3106             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3107         }
3108         PerlIO_printf(Perl_debug_log, "\n");
3109     });
3110     Safefree(q);
3111     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3112     return stclass;
3113 }
3114
3115
3116 #define DEBUG_PEEP(str,scan,depth) \
3117     DEBUG_OPTIMISE_r({if (scan){ \
3118        SV * const mysv=sv_newmortal(); \
3119        regnode *Next = regnext(scan); \
3120        regprop(RExC_rx, mysv, scan, NULL); \
3121        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3122        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3123        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3124    }});
3125
3126
3127 /* The below joins as many adjacent EXACTish nodes as possible into a single
3128  * one.  The regop may be changed if the node(s) contain certain sequences that
3129  * require special handling.  The joining is only done if:
3130  * 1) there is room in the current conglomerated node to entirely contain the
3131  *    next one.
3132  * 2) they are the exact same node type
3133  *
3134  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3135  * these get optimized out
3136  *
3137  * If a node is to match under /i (folded), the number of characters it matches
3138  * can be different than its character length if it contains a multi-character
3139  * fold.  *min_subtract is set to the total delta number of characters of the
3140  * input nodes.
3141  *
3142  * And *unfolded_multi_char is set to indicate whether or not the node contains
3143  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3144  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3145  * SMALL LETTER SHARP S, as only if the target string being matched against
3146  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3147  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3148  * whose components are all above the Latin1 range are not run-time locale
3149  * dependent, and have already been folded by the time this function is
3150  * called.)
3151  *
3152  * This is as good a place as any to discuss the design of handling these
3153  * multi-character fold sequences.  It's been wrong in Perl for a very long
3154  * time.  There are three code points in Unicode whose multi-character folds
3155  * were long ago discovered to mess things up.  The previous designs for
3156  * dealing with these involved assigning a special node for them.  This
3157  * approach doesn't always work, as evidenced by this example:
3158  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3159  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3160  * would match just the \xDF, it won't be able to handle the case where a
3161  * successful match would have to cross the node's boundary.  The new approach
3162  * that hopefully generally solves the problem generates an EXACTFU_SS node
3163  * that is "sss" in this case.
3164  *
3165  * It turns out that there are problems with all multi-character folds, and not
3166  * just these three.  Now the code is general, for all such cases.  The
3167  * approach taken is:
3168  * 1)   This routine examines each EXACTFish node that could contain multi-
3169  *      character folded sequences.  Since a single character can fold into
3170  *      such a sequence, the minimum match length for this node is less than
3171  *      the number of characters in the node.  This routine returns in
3172  *      *min_subtract how many characters to subtract from the the actual
3173  *      length of the string to get a real minimum match length; it is 0 if
3174  *      there are no multi-char foldeds.  This delta is used by the caller to
3175  *      adjust the min length of the match, and the delta between min and max,
3176  *      so that the optimizer doesn't reject these possibilities based on size
3177  *      constraints.
3178  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3179  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3180  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3181  *      there is a possible fold length change.  That means that a regular
3182  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3183  *      with length changes, and so can be processed faster.  regexec.c takes
3184  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3185  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3186  *      known until runtime).  This saves effort in regex matching.  However,
3187  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3188  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3189  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3190  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3191  *      possibilities for the non-UTF8 patterns are quite simple, except for
3192  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3193  *      members of a fold-pair, and arrays are set up for all of them so that
3194  *      the other member of the pair can be found quickly.  Code elsewhere in
3195  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3196  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3197  *      described in the next item.
3198  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3199  *      validity of the fold won't be known until runtime, and so must remain
3200  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3201  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3202  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3203  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3204  *      The reason this is a problem is that the optimizer part of regexec.c
3205  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3206  *      that a character in the pattern corresponds to at most a single
3207  *      character in the target string.  (And I do mean character, and not byte
3208  *      here, unlike other parts of the documentation that have never been
3209  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3210  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3211  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3212  *      nodes, violate the assumption, and they are the only instances where it
3213  *      is violated.  I'm reluctant to try to change the assumption, as the
3214  *      code involved is impenetrable to me (khw), so instead the code here
3215  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3216  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3217  *      boolean indicating whether or not the node contains such a fold.  When
3218  *      it is true, the caller sets a flag that later causes the optimizer in
3219  *      this file to not set values for the floating and fixed string lengths,
3220  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3221  *      assumption.  Thus, there is no optimization based on string lengths for
3222  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3223  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3224  *      assumption is wrong only in these cases is that all other non-UTF-8
3225  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3226  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3227  *      EXACTF nodes because we don't know at compile time if it actually
3228  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3229  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3230  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3231  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3232  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3233  *      string would require the pattern to be forced into UTF-8, the overhead
3234  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3235  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3236  *      locale.)
3237  *
3238  *      Similarly, the code that generates tries doesn't currently handle
3239  *      not-already-folded multi-char folds, and it looks like a pain to change
3240  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3241  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3242  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3243  *      using /iaa matching will be doing so almost entirely with ASCII
3244  *      strings, so this should rarely be encountered in practice */
3245
3246 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3247     if (PL_regkind[OP(scan)] == EXACT) \
3248         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3249
3250 STATIC U32
3251 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3252                    UV *min_subtract, bool *unfolded_multi_char,
3253                    U32 flags,regnode *val, U32 depth)
3254 {
3255     /* Merge several consecutive EXACTish nodes into one. */
3256     regnode *n = regnext(scan);
3257     U32 stringok = 1;
3258     regnode *next = scan + NODE_SZ_STR(scan);
3259     U32 merged = 0;
3260     U32 stopnow = 0;
3261 #ifdef DEBUGGING
3262     regnode *stop = scan;
3263     GET_RE_DEBUG_FLAGS_DECL;
3264 #else
3265     PERL_UNUSED_ARG(depth);
3266 #endif
3267
3268     PERL_ARGS_ASSERT_JOIN_EXACT;
3269 #ifndef EXPERIMENTAL_INPLACESCAN
3270     PERL_UNUSED_ARG(flags);
3271     PERL_UNUSED_ARG(val);
3272 #endif
3273     DEBUG_PEEP("join",scan,depth);
3274
3275     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3276      * EXACT ones that are mergeable to the current one. */
3277     while (n
3278            && (PL_regkind[OP(n)] == NOTHING
3279                || (stringok && OP(n) == OP(scan)))
3280            && NEXT_OFF(n)
3281            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3282     {
3283
3284         if (OP(n) == TAIL || n > next)
3285             stringok = 0;
3286         if (PL_regkind[OP(n)] == NOTHING) {
3287             DEBUG_PEEP("skip:",n,depth);
3288             NEXT_OFF(scan) += NEXT_OFF(n);
3289             next = n + NODE_STEP_REGNODE;
3290 #ifdef DEBUGGING
3291             if (stringok)
3292                 stop = n;
3293 #endif
3294             n = regnext(n);
3295         }
3296         else if (stringok) {
3297             const unsigned int oldl = STR_LEN(scan);
3298             regnode * const nnext = regnext(n);
3299
3300             /* XXX I (khw) kind of doubt that this works on platforms (should
3301              * Perl ever run on one) where U8_MAX is above 255 because of lots
3302              * of other assumptions */
3303             /* Don't join if the sum can't fit into a single node */
3304             if (oldl + STR_LEN(n) > U8_MAX)
3305                 break;
3306
3307             DEBUG_PEEP("merg",n,depth);
3308             merged++;
3309
3310             NEXT_OFF(scan) += NEXT_OFF(n);
3311             STR_LEN(scan) += STR_LEN(n);
3312             next = n + NODE_SZ_STR(n);
3313             /* Now we can overwrite *n : */
3314             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3315 #ifdef DEBUGGING
3316             stop = next - 1;
3317 #endif
3318             n = nnext;
3319             if (stopnow) break;
3320         }
3321
3322 #ifdef EXPERIMENTAL_INPLACESCAN
3323         if (flags && !NEXT_OFF(n)) {
3324             DEBUG_PEEP("atch", val, depth);
3325             if (reg_off_by_arg[OP(n)]) {
3326                 ARG_SET(n, val - n);
3327             }
3328             else {
3329                 NEXT_OFF(n) = val - n;
3330             }
3331             stopnow = 1;
3332         }
3333 #endif
3334     }
3335
3336     *min_subtract = 0;
3337     *unfolded_multi_char = FALSE;
3338
3339     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3340      * can now analyze for sequences of problematic code points.  (Prior to
3341      * this final joining, sequences could have been split over boundaries, and
3342      * hence missed).  The sequences only happen in folding, hence for any
3343      * non-EXACT EXACTish node */
3344     if (OP(scan) != EXACT) {
3345         U8* s0 = (U8*) STRING(scan);
3346         U8* s = s0;
3347         U8* s_end = s0 + STR_LEN(scan);
3348
3349         int total_count_delta = 0;  /* Total delta number of characters that
3350                                        multi-char folds expand to */
3351
3352         /* One pass is made over the node's string looking for all the
3353          * possibilities.  To avoid some tests in the loop, there are two main
3354          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3355          * non-UTF-8 */
3356         if (UTF) {
3357             U8* folded = NULL;
3358
3359             if (OP(scan) == EXACTFL) {
3360                 U8 *d;
3361
3362                 /* An EXACTFL node would already have been changed to another
3363                  * node type unless there is at least one character in it that
3364                  * is problematic; likely a character whose fold definition
3365                  * won't be known until runtime, and so has yet to be folded.
3366                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3367                  * to handle the UTF-8 case, we need to create a temporary
3368                  * folded copy using UTF-8 locale rules in order to analyze it.
3369                  * This is because our macros that look to see if a sequence is
3370                  * a multi-char fold assume everything is folded (otherwise the
3371                  * tests in those macros would be too complicated and slow).
3372                  * Note that here, the non-problematic folds will have already
3373                  * been done, so we can just copy such characters.  We actually
3374                  * don't completely fold the EXACTFL string.  We skip the
3375                  * unfolded multi-char folds, as that would just create work
3376                  * below to figure out the size they already are */
3377
3378                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3379                 d = folded;
3380                 while (s < s_end) {
3381                     STRLEN s_len = UTF8SKIP(s);
3382                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3383                         Copy(s, d, s_len, U8);
3384                         d += s_len;
3385                     }
3386                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3387                         *unfolded_multi_char = TRUE;
3388                         Copy(s, d, s_len, U8);
3389                         d += s_len;
3390                     }
3391                     else if (isASCII(*s)) {
3392                         *(d++) = toFOLD(*s);
3393                     }
3394                     else {
3395                         STRLEN len;
3396                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3397                         d += len;
3398                     }
3399                     s += s_len;
3400                 }
3401
3402                 /* Point the remainder of the routine to look at our temporary
3403                  * folded copy */
3404                 s = folded;
3405                 s_end = d;
3406             } /* End of creating folded copy of EXACTFL string */
3407
3408             /* Examine the string for a multi-character fold sequence.  UTF-8
3409              * patterns have all characters pre-folded by the time this code is
3410              * executed */
3411             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3412                                      length sequence we are looking for is 2 */
3413             {
3414                 int count = 0;  /* How many characters in a multi-char fold */
3415                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3416                 if (! len) {    /* Not a multi-char fold: get next char */
3417                     s += UTF8SKIP(s);
3418                     continue;
3419                 }
3420
3421                 /* Nodes with 'ss' require special handling, except for
3422                  * EXACTFA-ish for which there is no multi-char fold to this */
3423                 if (len == 2 && *s == 's' && *(s+1) == 's'
3424                     && OP(scan) != EXACTFA
3425                     && OP(scan) != EXACTFA_NO_TRIE)
3426                 {
3427                     count = 2;
3428                     if (OP(scan) != EXACTFL) {
3429                         OP(scan) = EXACTFU_SS;
3430                     }
3431                     s += 2;
3432                 }
3433                 else { /* Here is a generic multi-char fold. */
3434                     U8* multi_end  = s + len;
3435
3436                     /* Count how many characters are in it.  In the case of
3437                      * /aa, no folds which contain ASCII code points are
3438                      * allowed, so check for those, and skip if found. */
3439                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3440                         count = utf8_length(s, multi_end);
3441                         s = multi_end;
3442                     }
3443                     else {
3444                         while (s < multi_end) {
3445                             if (isASCII(*s)) {
3446                                 s++;
3447                                 goto next_iteration;
3448                             }
3449                             else {
3450                                 s += UTF8SKIP(s);
3451                             }
3452                             count++;
3453                         }
3454                     }
3455                 }
3456
3457                 /* The delta is how long the sequence is minus 1 (1 is how long
3458                  * the character that folds to the sequence is) */
3459                 total_count_delta += count - 1;
3460               next_iteration: ;
3461             }
3462
3463             /* We created a temporary folded copy of the string in EXACTFL
3464              * nodes.  Therefore we need to be sure it doesn't go below zero,
3465              * as the real string could be shorter */
3466             if (OP(scan) == EXACTFL) {
3467                 int total_chars = utf8_length((U8*) STRING(scan),
3468                                            (U8*) STRING(scan) + STR_LEN(scan));
3469                 if (total_count_delta > total_chars) {
3470                     total_count_delta = total_chars;
3471                 }
3472             }
3473
3474             *min_subtract += total_count_delta;
3475             Safefree(folded);
3476         }
3477         else if (OP(scan) == EXACTFA) {
3478
3479             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3480              * fold to the ASCII range (and there are no existing ones in the
3481              * upper latin1 range).  But, as outlined in the comments preceding
3482              * this function, we need to flag any occurrences of the sharp s.
3483              * This character forbids trie formation (because of added
3484              * complexity) */
3485             while (s < s_end) {
3486                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3487                     OP(scan) = EXACTFA_NO_TRIE;
3488                     *unfolded_multi_char = TRUE;
3489                     break;
3490                 }
3491                 s++;
3492                 continue;
3493             }
3494         }
3495         else {
3496
3497             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3498              * folds that are all Latin1.  As explained in the comments
3499              * preceding this function, we look also for the sharp s in EXACTF
3500              * and EXACTFL nodes; it can be in the final position.  Otherwise
3501              * we can stop looking 1 byte earlier because have to find at least
3502              * two characters for a multi-fold */
3503             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3504                               ? s_end
3505                               : s_end -1;
3506
3507             while (s < upper) {
3508                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3509                 if (! len) {    /* Not a multi-char fold. */
3510                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3511                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3512                     {
3513                         *unfolded_multi_char = TRUE;
3514                     }
3515                     s++;
3516                     continue;
3517                 }
3518
3519                 if (len == 2
3520                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3521                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3522                 {
3523
3524                     /* EXACTF nodes need to know that the minimum length
3525                      * changed so that a sharp s in the string can match this
3526                      * ss in the pattern, but they remain EXACTF nodes, as they
3527                      * won't match this unless the target string is is UTF-8,
3528                      * which we don't know until runtime.  EXACTFL nodes can't
3529                      * transform into EXACTFU nodes */
3530                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3531                         OP(scan) = EXACTFU_SS;
3532                     }
3533                 }
3534
3535                 *min_subtract += len - 1;
3536                 s += len;
3537             }
3538         }
3539     }
3540
3541 #ifdef DEBUGGING
3542     /* Allow dumping but overwriting the collection of skipped
3543      * ops and/or strings with fake optimized ops */
3544     n = scan + NODE_SZ_STR(scan);
3545     while (n <= stop) {
3546         OP(n) = OPTIMIZED;
3547         FLAGS(n) = 0;
3548         NEXT_OFF(n) = 0;
3549         n++;
3550     }
3551 #endif
3552     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3553     return stopnow;
3554 }
3555
3556 /* REx optimizer.  Converts nodes into quicker variants "in place".
3557    Finds fixed substrings.  */
3558
3559 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3560    to the position after last scanned or to NULL. */
3561
3562 #define INIT_AND_WITHP \
3563     assert(!and_withp); \
3564     Newx(and_withp,1, regnode_ssc); \
3565     SAVEFREEPV(and_withp)
3566
3567 /* this is a chain of data about sub patterns we are processing that
3568    need to be handled separately/specially in study_chunk. Its so
3569    we can simulate recursion without losing state.  */
3570 struct scan_frame;
3571 typedef struct scan_frame {
3572     regnode *last;  /* last node to process in this frame */
3573     regnode *next;  /* next node to process when last is reached */
3574     struct scan_frame *prev; /*previous frame*/
3575     U32 prev_recursed_depth;
3576     I32 stop; /* what stopparen do we use */
3577 } scan_frame;
3578
3579
3580 STATIC SSize_t
3581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3582                         SSize_t *minlenp, SSize_t *deltap,
3583                         regnode *last,
3584                         scan_data_t *data,
3585                         I32 stopparen,
3586                         U32 recursed_depth,
3587                         regnode_ssc *and_withp,
3588                         U32 flags, U32 depth)
3589                         /* scanp: Start here (read-write). */
3590                         /* deltap: Write maxlen-minlen here. */
3591                         /* last: Stop before this one. */
3592                         /* data: string data about the pattern */
3593                         /* stopparen: treat close N as END */
3594                         /* recursed: which subroutines have we recursed into */
3595                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3596 {
3597     dVAR;
3598     /* There must be at least this number of characters to match */
3599     SSize_t min = 0;
3600     I32 pars = 0, code;
3601     regnode *scan = *scanp, *next;
3602     SSize_t delta = 0;
3603     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3604     int is_inf_internal = 0;            /* The studied chunk is infinite */
3605     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3606     scan_data_t data_fake;
3607     SV *re_trie_maxbuff = NULL;
3608     regnode *first_non_open = scan;
3609     SSize_t stopmin = SSize_t_MAX;
3610     scan_frame *frame = NULL;
3611     GET_RE_DEBUG_FLAGS_DECL;
3612
3613     PERL_ARGS_ASSERT_STUDY_CHUNK;
3614
3615 #ifdef DEBUGGING
3616     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3617 #endif
3618     if ( depth == 0 ) {
3619         while (first_non_open && OP(first_non_open) == OPEN)
3620             first_non_open=regnext(first_non_open);
3621     }
3622
3623
3624   fake_study_recurse:
3625     while ( scan && OP(scan) != END && scan < last ){
3626         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3627                                    node length to get a real minimum (because
3628                                    the folded version may be shorter) */
3629         bool unfolded_multi_char = FALSE;
3630         /* Peephole optimizer: */
3631         DEBUG_OPTIMISE_MORE_r(
3632         {
3633             PerlIO_printf(Perl_debug_log,
3634                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3635                 ((int) depth*2), "", (long)stopparen,
3636                 (unsigned long)depth, (unsigned long)recursed_depth);
3637             if (recursed_depth) {
3638                 U32 i;
3639                 U32 j;
3640                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3641                     PerlIO_printf(Perl_debug_log,"[");
3642                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3643                         PerlIO_printf(Perl_debug_log,"%d",
3644                             PAREN_TEST(RExC_study_chunk_recursed +
3645                                        (j * RExC_study_chunk_recursed_bytes), i)
3646                             ? 1 : 0
3647                         );
3648                     PerlIO_printf(Perl_debug_log,"]");
3649                 }
3650             }
3651             PerlIO_printf(Perl_debug_log,"\n");
3652         }
3653         );
3654         DEBUG_STUDYDATA("Peep:", data, depth);
3655         DEBUG_PEEP("Peep", scan, depth);
3656
3657
3658         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3659          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3660          * by a different invocation of reg() -- Yves
3661          */
3662         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3663
3664         /* Follow the next-chain of the current node and optimize
3665            away all the NOTHINGs from it.  */
3666         if (OP(scan) != CURLYX) {
3667             const int max = (reg_off_by_arg[OP(scan)]
3668                        ? I32_MAX
3669                        /* I32 may be smaller than U16 on CRAYs! */
3670                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3671             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3672             int noff;
3673             regnode *n = scan;
3674
3675             /* Skip NOTHING and LONGJMP. */
3676             while ((n = regnext(n))
3677                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3678                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3679                    && off + noff < max)
3680                 off += noff;
3681             if (reg_off_by_arg[OP(scan)])
3682                 ARG(scan) = off;
3683             else
3684                 NEXT_OFF(scan) = off;
3685         }
3686
3687
3688
3689         /* The principal pseudo-switch.  Cannot be a switch, since we
3690            look into several different things.  */
3691         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3692                    || OP(scan) == IFTHEN) {
3693             next = regnext(scan);
3694             code = OP(scan);
3695             /* demq: the op(next)==code check is to see if we have
3696              * "branch-branch" AFAICT */
3697
3698             if (OP(next) == code || code == IFTHEN) {
3699                 /* NOTE - There is similar code to this block below for
3700                  * handling TRIE nodes on a re-study.  If you change stuff here
3701                  * check there too. */
3702                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3703                 regnode_ssc accum;
3704                 regnode * const startbranch=scan;
3705
3706                 if (flags & SCF_DO_SUBSTR) {
3707                     /* Cannot merge strings after this. */
3708                     scan_commit(pRExC_state, data, minlenp, is_inf);
3709                 }
3710
3711                 if (flags & SCF_DO_STCLASS)
3712                     ssc_init_zero(pRExC_state, &accum);
3713
3714                 while (OP(scan) == code) {
3715                     SSize_t deltanext, minnext, fake;
3716                     I32 f = 0;
3717                     regnode_ssc this_class;
3718
3719                     num++;
3720                     data_fake.flags = 0;
3721                     if (data) {
3722                         data_fake.whilem_c = data->whilem_c;
3723                         data_fake.last_closep = data->last_closep;
3724                     }
3725                     else
3726                         data_fake.last_closep = &fake;
3727
3728                     data_fake.pos_delta = delta;
3729                     next = regnext(scan);
3730                     scan = NEXTOPER(scan);
3731                     if (code != BRANCH)
3732                         scan = NEXTOPER(scan);
3733                     if (flags & SCF_DO_STCLASS) {
3734                         ssc_init(pRExC_state, &this_class);
3735                         data_fake.start_class = &this_class;
3736                         f = SCF_DO_STCLASS_AND;
3737                     }
3738                     if (flags & SCF_WHILEM_VISITED_POS)
3739                         f |= SCF_WHILEM_VISITED_POS;
3740
3741                     /* we suppose the run is continuous, last=next...*/
3742                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3743                                       &deltanext, next, &data_fake, stopparen,
3744                                       recursed_depth, NULL, f,depth+1);
3745                     if (min1 > minnext)
3746                         min1 = minnext;
3747                     if (deltanext == SSize_t_MAX) {
3748                         is_inf = is_inf_internal = 1;
3749                         max1 = SSize_t_MAX;
3750                     } else if (max1 < minnext + deltanext)
3751                         max1 = minnext + deltanext;
3752                     scan = next;
3753                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3754                         pars++;
3755                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3756                         if ( stopmin > minnext)
3757                             stopmin = min + min1;
3758                         flags &= ~SCF_DO_SUBSTR;
3759                         if (data)
3760                             data->flags |= SCF_SEEN_ACCEPT;
3761                     }
3762                     if (data) {
3763                         if (data_fake.flags & SF_HAS_EVAL)
3764                             data->flags |= SF_HAS_EVAL;
3765                         data->whilem_c = data_fake.whilem_c;
3766                     }
3767                     if (flags & SCF_DO_STCLASS)
3768                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3769                 }
3770                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3771                     min1 = 0;
3772                 if (flags & SCF_DO_SUBSTR) {
3773                     data->pos_min += min1;
3774                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3775                         data->pos_delta = SSize_t_MAX;
3776                     else
3777                         data->pos_delta += max1 - min1;
3778                     if (max1 != min1 || is_inf)
3779                         data->longest = &(data->longest_float);
3780                 }
3781                 min += min1;
3782                 if (delta == SSize_t_MAX
3783                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3784                     delta = SSize_t_MAX;
3785                 else
3786                     delta += max1 - min1;
3787                 if (flags & SCF_DO_STCLASS_OR) {
3788                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3789                     if (min1) {
3790                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3791                         flags &= ~SCF_DO_STCLASS;
3792                     }
3793                 }
3794                 else if (flags & SCF_DO_STCLASS_AND) {
3795                     if (min1) {
3796                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3797                         flags &= ~SCF_DO_STCLASS;
3798                     }
3799                     else {
3800                         /* Switch to OR mode: cache the old value of
3801                          * data->start_class */
3802                         INIT_AND_WITHP;
3803                         StructCopy(data->start_class, and_withp, regnode_ssc);
3804                         flags &= ~SCF_DO_STCLASS_AND;
3805                         StructCopy(&accum, data->start_class, regnode_ssc);
3806                         flags |= SCF_DO_STCLASS_OR;
3807                     }
3808                 }
3809
3810                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3811                         OP( startbranch ) == BRANCH )
3812                 {
3813                 /* demq.
3814
3815                    Assuming this was/is a branch we are dealing with: 'scan'
3816                    now points at the item that follows the branch sequence,
3817                    whatever it is. We now start at the beginning of the
3818                    sequence and look for subsequences of
3819
3820                    BRANCH->EXACT=>x1
3821                    BRANCH->EXACT=>x2
3822                    tail
3823
3824                    which would be constructed from a pattern like
3825                    /A|LIST|OF|WORDS/
3826
3827                    If we can find such a subsequence we need to turn the first
3828                    element into a trie and then add the subsequent branch exact
3829                    strings to the trie.
3830
3831                    We have two cases
3832
3833                      1. patterns where the whole set of branches can be
3834                         converted.
3835
3836                      2. patterns where only a subset can be converted.
3837
3838                    In case 1 we can replace the whole set with a single regop
3839                    for the trie. In case 2 we need to keep the start and end
3840                    branches so
3841
3842                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3843                      becomes BRANCH TRIE; BRANCH X;
3844
3845                   There is an additional case, that being where there is a
3846                   common prefix, which gets split out into an EXACT like node
3847                   preceding the TRIE node.
3848
3849                   If x(1..n)==tail then we can do a simple trie, if not we make
3850                   a "jump" trie, such that when we match the appropriate word
3851                   we "jump" to the appropriate tail node. Essentially we turn
3852                   a nested if into a case structure of sorts.
3853
3854                 */
3855
3856                     int made=0;
3857                     if (!re_trie_maxbuff) {
3858                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3859                         if (!SvIOK(re_trie_maxbuff))
3860                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3861                     }
3862                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3863                         regnode *cur;
3864                         regnode *first = (regnode *)NULL;
3865                         regnode *last = (regnode *)NULL;
3866                         regnode *tail = scan;
3867                         U8 trietype = 0;
3868                         U32 count=0;
3869
3870 #ifdef DEBUGGING
3871                         SV * const mysv = sv_newmortal();   /* for dumping */
3872 #endif
3873                         /* var tail is used because there may be a TAIL
3874                            regop in the way. Ie, the exacts will point to the
3875                            thing following the TAIL, but the last branch will
3876                            point at the TAIL. So we advance tail. If we
3877                            have nested (?:) we may have to move through several
3878                            tails.
3879                          */
3880
3881                         while ( OP( tail ) == TAIL ) {
3882                             /* this is the TAIL generated by (?:) */
3883                             tail = regnext( tail );
3884                         }
3885
3886
3887                         DEBUG_TRIE_COMPILE_r({
3888                             regprop(RExC_rx, mysv, tail, NULL);
3889                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3890                               (int)depth * 2 + 2, "",
3891                               "Looking for TRIE'able sequences. Tail node is: ",
3892                               SvPV_nolen_const( mysv )
3893                             );
3894                         });
3895
3896                         /*
3897
3898                             Step through the branches
3899                                 cur represents each branch,
3900                                 noper is the first thing to be matched as part
3901                                       of that branch
3902                                 noper_next is the regnext() of that node.
3903
3904                             We normally handle a case like this
3905                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3906                             support building with NOJUMPTRIE, which restricts
3907                             the trie logic to structures like /FOO|BAR/.
3908
3909                             If noper is a trieable nodetype then the branch is
3910                             a possible optimization target. If we are building
3911                             under NOJUMPTRIE then we require that noper_next is
3912                             the same as scan (our current position in the regex
3913                             program).
3914
3915                             Once we have two or more consecutive such branches
3916                             we can create a trie of the EXACT's contents and
3917                             stitch it in place into the program.
3918
3919                             If the sequence represents all of the branches in
3920                             the alternation we replace the entire thing with a
3921                             single TRIE node.
3922
3923                             Otherwise when it is a subsequence we need to
3924                             stitch it in place and replace only the relevant
3925                             branches. This means the first branch has to remain
3926                             as it is used by the alternation logic, and its
3927                             next pointer, and needs to be repointed at the item
3928                             on the branch chain following the last branch we
3929                             have optimized away.
3930
3931                             This could be either a BRANCH, in which case the
3932                             subsequence is internal, or it could be the item
3933                             following the branch sequence in which case the
3934                             subsequence is at the end (which does not
3935                             necessarily mean the first node is the start of the
3936                             alternation).
3937
3938                             TRIE_TYPE(X) is a define which maps the optype to a
3939                             trietype.
3940
3941                                 optype          |  trietype
3942                                 ----------------+-----------
3943                                 NOTHING         | NOTHING
3944                                 EXACT           | EXACT
3945                                 EXACTFU         | EXACTFU
3946                                 EXACTFU_SS      | EXACTFU
3947                                 EXACTFA         | EXACTFA
3948
3949
3950                         */
3951 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3952                        ( EXACT == (X) )   ? EXACT :        \
3953                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3954                        ( EXACTFA == (X) ) ? EXACTFA :        \
3955                        0 )
3956
3957                         /* dont use tail as the end marker for this traverse */
3958                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3959                             regnode * const noper = NEXTOPER( cur );
3960                             U8 noper_type = OP( noper );
3961                             U8 noper_trietype = TRIE_TYPE( noper_type );
3962 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3963                             regnode * const noper_next = regnext( noper );
3964                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3965                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3966 #endif
3967
3968                             DEBUG_TRIE_COMPILE_r({
3969                                 regprop(RExC_rx, mysv, cur, NULL);
3970                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3971                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3972
3973                                 regprop(RExC_rx, mysv, noper, NULL);
3974                                 PerlIO_printf( Perl_debug_log, " -> %s",
3975                                     SvPV_nolen_const(mysv));
3976
3977                                 if ( noper_next ) {
3978                                   regprop(RExC_rx, mysv, noper_next, NULL);
3979                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3980                                     SvPV_nolen_const(mysv));
3981                                 }
3982                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3983                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3984                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3985                                 );
3986                             });
3987
3988                             /* Is noper a trieable nodetype that can be merged
3989                              * with the current trie (if there is one)? */
3990                             if ( noper_trietype
3991                                   &&
3992                                   (
3993                                         ( noper_trietype == NOTHING)
3994                                         || ( trietype == NOTHING )
3995                                         || ( trietype == noper_trietype )
3996                                   )
3997 #ifdef NOJUMPTRIE
3998                                   && noper_next == tail
3999 #endif
4000                                   && count < U16_MAX)
4001                             {
4002                                 /* Handle mergable triable node Either we are
4003                                  * the first node in a new trieable sequence,
4004                                  * in which case we do some bookkeeping,
4005                                  * otherwise we update the end pointer. */
4006                                 if ( !first ) {
4007                                     first = cur;
4008                                     if ( noper_trietype == NOTHING ) {
4009 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4010                                         regnode * const noper_next = regnext( noper );
4011                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4012                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4013 #endif
4014
4015                                         if ( noper_next_trietype ) {
4016                                             trietype = noper_next_trietype;
4017                                         } else if (noper_next_type)  {
4018                                             /* a NOTHING regop is 1 regop wide.
4019                                              * We need at least two for a trie
4020                                              * so we can't merge this in */
4021                                             first = NULL;
4022                                         }
4023                                     } else {
4024                                         trietype = noper_trietype;
4025                                     }
4026                                 } else {
4027                                     if ( trietype == NOTHING )
4028                                         trietype = noper_trietype;
4029                                     last = cur;
4030                                 }
4031                                 if (first)
4032                                     count++;
4033                             } /* end handle mergable triable node */
4034                             else {
4035                                 /* handle unmergable node -
4036                                  * noper may either be a triable node which can
4037                                  * not be tried together with the current trie,
4038                                  * or a non triable node */
4039                                 if ( last ) {
4040                                     /* If last is set and trietype is not
4041                                      * NOTHING then we have found at least two
4042                                      * triable branch sequences in a row of a
4043                                      * similar trietype so we can turn them
4044                                      * into a trie. If/when we allow NOTHING to
4045                                      * start a trie sequence this condition
4046                                      * will be required, and it isn't expensive
4047                                      * so we leave it in for now. */
4048                                     if ( trietype && trietype != NOTHING )
4049                                         make_trie( pRExC_state,
4050                                                 startbranch, first, cur, tail,
4051                                                 count, trietype, depth+1 );
4052                                     last = NULL; /* note: we clear/update
4053                                                     first, trietype etc below,
4054                                                     so we dont do it here */
4055                                 }
4056                                 if ( noper_trietype
4057 #ifdef NOJUMPTRIE
4058                                      && noper_next == tail
4059 #endif
4060                                 ){
4061                                     /* noper is triable, so we can start a new
4062                                      * trie sequence */
4063                                     count = 1;
4064                                     first = cur;
4065                                     trietype = noper_trietype;
4066                                 } else if (first) {
4067                                     /* if we already saw a first but the
4068                                      * current node is not triable then we have
4069                                      * to reset the first information. */
4070                                     count = 0;
4071                                     first = NULL;
4072                                     trietype = 0;
4073                                 }
4074                             } /* end handle unmergable node */
4075                         } /* loop over branches */
4076                         DEBUG_TRIE_COMPILE_r({
4077                             regprop(RExC_rx, mysv, cur, NULL);
4078                             PerlIO_printf( Perl_debug_log,
4079                               "%*s- %s (%d) <SCAN FINISHED>\n",
4080                               (int)depth * 2 + 2,
4081                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4082
4083                         });
4084                         if ( last && trietype ) {
4085                             if ( trietype != NOTHING ) {
4086                                 /* the last branch of the sequence was part of
4087                                  * a trie, so we have to construct it here
4088                                  * outside of the loop */
4089                                 made= make_trie( pRExC_state, startbranch,
4090                                                  first, scan, tail, count,
4091                                                  trietype, depth+1 );
4092 #ifdef TRIE_STUDY_OPT
4093                                 if ( ((made == MADE_EXACT_TRIE &&
4094                                      startbranch == first)
4095                                      || ( first_non_open == first )) &&
4096                                      depth==0 ) {
4097                                     flags |= SCF_TRIE_RESTUDY;
4098                                     if ( startbranch == first
4099                                          && scan == tail )
4100                                     {
4101                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4102                                     }
4103                                 }
4104 #endif
4105                             } else {
4106                                 /* at this point we know whatever we have is a
4107                                  * NOTHING sequence/branch AND if 'startbranch'
4108                                  * is 'first' then we can turn the whole thing
4109                                  * into a NOTHING
4110                                  */
4111                                 if ( startbranch == first ) {
4112                                     regnode *opt;
4113                                     /* the entire thing is a NOTHING sequence,
4114                                      * something like this: (?:|) So we can
4115                                      * turn it into a plain NOTHING op. */
4116                                     DEBUG_TRIE_COMPILE_r({
4117                                         regprop(RExC_rx, mysv, cur, NULL);
4118                                         PerlIO_printf( Perl_debug_log,
4119                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4120                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4121
4122                                     });
4123                                     OP(startbranch)= NOTHING;
4124                                     NEXT_OFF(startbranch)= tail - startbranch;
4125                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4126                                         OP(opt)= OPTIMIZED;
4127                                 }
4128                             }
4129                         } /* end if ( last) */
4130                     } /* TRIE_MAXBUF is non zero */
4131
4132                 } /* do trie */
4133
4134             }
4135             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4136                 scan = NEXTOPER(NEXTOPER(scan));
4137             } else                      /* single branch is optimized. */
4138                 scan = NEXTOPER(scan);
4139             continue;
4140         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4141             scan_frame *newframe = NULL;
4142             I32 paren;
4143             regnode *start;
4144             regnode *end;
4145             U32 my_recursed_depth= recursed_depth;
4146
4147             if (OP(scan) != SUSPEND) {
4148                 /* set the pointer */
4149                 if (OP(scan) == GOSUB) {
4150                     paren = ARG(scan);
4151                     RExC_recurse[ARG2L(scan)] = scan;
4152                     start = RExC_open_parens[paren-1];
4153                     end   = RExC_close_parens[paren-1];
4154                 } else {
4155                     paren = 0;
4156                     start = RExC_rxi->program + 1;
4157                     end   = RExC_opend;
4158                 }
4159                 if (!recursed_depth
4160                     ||
4161                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4162                 ) {
4163                     if (!recursed_depth) {
4164                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4165                     } else {
4166                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4167                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4168                              RExC_study_chunk_recursed_bytes, U8);
4169                     }
4170                     /* we havent recursed into this paren yet, so recurse into it */
4171                     DEBUG_STUDYDATA("set:", data,depth);
4172                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4173                     my_recursed_depth= recursed_depth + 1;
4174                     Newx(newframe,1,scan_frame);
4175                 } else {
4176                     DEBUG_STUDYDATA("inf:", data,depth);
4177                     /* some form of infinite recursion, assume infinite length
4178                      * */
4179                     if (flags & SCF_DO_SUBSTR) {
4180                         scan_commit(pRExC_state, data, minlenp, is_inf);
4181                         data->longest = &(data->longest_float);
4182                     }
4183                     is_inf = is_inf_internal = 1;
4184                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4185                         ssc_anything(data->start_class);
4186                     flags &= ~SCF_DO_STCLASS;
4187                 }
4188             } else {
4189                 Newx(newframe,1,scan_frame);
4190                 paren = stopparen;
4191                 start = scan+2;
4192                 end = regnext(scan);
4193             }
4194             if (newframe) {
4195                 assert(start);
4196                 assert(end);
4197                 SAVEFREEPV(newframe);
4198                 newframe->next = regnext(scan);
4199                 newframe->last = last;
4200                 newframe->stop = stopparen;
4201                 newframe->prev = frame;
4202                 newframe->prev_recursed_depth = recursed_depth;
4203
4204                 DEBUG_STUDYDATA("frame-new:",data,depth);
4205                 DEBUG_PEEP("fnew", scan, depth);
4206
4207                 frame = newframe;
4208                 scan =  start;
4209                 stopparen = paren;
4210                 last = end;
4211                 depth = depth + 1;
4212                 recursed_depth= my_recursed_depth;
4213
4214                 continue;
4215             }
4216         }
4217         else if (OP(scan) == EXACT) {
4218             SSize_t l = STR_LEN(scan);
4219             UV uc;
4220             if (UTF) {
4221                 const U8 * const s = (U8*)STRING(scan);
4222                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4223                 l = utf8_length(s, s + l);
4224             } else {
4225                 uc = *((U8*)STRING(scan));
4226             }
4227             min += l;
4228             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4229                 /* The code below prefers earlier match for fixed
4230                    offset, later match for variable offset.  */
4231                 if (data->last_end == -1) { /* Update the start info. */
4232                     data->last_start_min = data->pos_min;
4233                     data->last_start_max = is_inf
4234                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4235                 }
4236                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4237                 if (UTF)
4238                     SvUTF8_on(data->last_found);
4239                 {
4240                     SV * const sv = data->last_found;
4241                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4242                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4243                     if (mg && mg->mg_len >= 0)
4244                         mg->mg_len += utf8_length((U8*)STRING(scan),
4245                                               (U8*)STRING(scan)+STR_LEN(scan));
4246                 }
4247                 data->last_end = data->pos_min + l;
4248                 data->pos_min += l; /* As in the first entry. */
4249                 data->flags &= ~SF_BEFORE_EOL;
4250             }
4251
4252             /* ANDing the code point leaves at most it, and not in locale, and
4253              * can't match null string */
4254             if (flags & SCF_DO_STCLASS_AND) {
4255                 ssc_cp_and(data->start_class, uc);
4256                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4257                 ssc_clear_locale(data->start_class);
4258             }
4259             else if (flags & SCF_DO_STCLASS_OR) {
4260                 ssc_add_cp(data->start_class, uc);
4261                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4262
4263                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4264                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4265             }
4266             flags &= ~SCF_DO_STCLASS;
4267         }
4268         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4269                                                      EXACTFish */
4270             SSize_t l = STR_LEN(scan);
4271             UV uc = *((U8*)STRING(scan));
4272             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4273                                                      separate code points */
4274             const U8 * s = (U8*)STRING(scan);
4275
4276             /* Search for fixed substrings supports EXACT only. */
4277             if (flags & SCF_DO_SUBSTR) {
4278                 assert(data);
4279                 scan_commit(pRExC_state, data, minlenp, is_inf);
4280             }
4281             if (UTF) {
4282                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4283                 l = utf8_length(s, s + l);
4284             }
4285             if (unfolded_multi_char) {
4286                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4287             }
4288             min += l - min_subtract;
4289             assert (min >= 0);
4290             delta += min_subtract;
4291             if (flags & SCF_DO_SUBSTR) {
4292                 data->pos_min += l - min_subtract;
4293                 if (data->pos_min < 0) {
4294                     data->pos_min = 0;
4295                 }
4296                 data->pos_delta += min_subtract;
4297                 if (min_subtract) {
4298                     data->longest = &(data->longest_float);
4299                 }
4300             }
4301
4302             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4303                 ssc_clear_locale(data->start_class);
4304             }
4305
4306             if (! UTF) {
4307
4308                 /* We punt and assume can match anything if the node begins
4309                  * with a multi-character fold.  Things are complicated.  For
4310                  * example, /ffi/i could match any of:
4311                  *  "\N{LATIN SMALL LIGATURE FFI}"
4312                  *  "\N{LATIN SMALL LIGATURE FF}I"
4313                  *  "F\N{LATIN SMALL LIGATURE FI}"
4314                  *  plus several other things; and making sure we have all the
4315                  *  possibilities is hard. */
4316                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4317                     EXACTF_invlist =
4318                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4319                 }
4320                 else {
4321
4322                     /* Any Latin1 range character can potentially match any
4323                      * other depending on the locale */
4324                     if (OP(scan) == EXACTFL) {
4325                         _invlist_union(EXACTF_invlist, PL_Latin1,
4326                                                               &EXACTF_invlist);
4327                     }
4328                     else {
4329                         /* But otherwise, it matches at least itself.  We can
4330                          * quickly tell if it has a distinct fold, and if so,
4331                          * it matches that as well */
4332                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4333                         if (IS_IN_SOME_FOLD_L1(uc)) {
4334                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4335                                                            PL_fold_latin1[uc]);
4336                         }
4337                     }
4338
4339                     /* Some characters match above-Latin1 ones under /i.  This
4340                      * is true of EXACTFL ones when the locale is UTF-8 */
4341                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4342                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4343                                             && OP(scan) != EXACTFA_NO_TRIE)))
4344                     {
4345                         add_above_Latin1_folds(pRExC_state,
4346                                                (U8) uc,
4347                                                &EXACTF_invlist);
4348                     }
4349                 }
4350             }
4351             else {  /* Pattern is UTF-8 */
4352                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4353                 STRLEN foldlen = UTF8SKIP(s);
4354                 const U8* e = s + STR_LEN(scan);
4355                 SV** listp;
4356
4357                 /* The only code points that aren't folded in a UTF EXACTFish
4358                  * node are are the problematic ones in EXACTFL nodes */
4359                 if (OP(scan) == EXACTFL
4360                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4361                 {
4362                     /* We need to check for the possibility that this EXACTFL
4363                      * node begins with a multi-char fold.  Therefore we fold
4364                      * the first few characters of it so that we can make that
4365                      * check */
4366                     U8 *d = folded;
4367                     int i;
4368
4369                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4370                         if (isASCII(*s)) {
4371                             *(d++) = (U8) toFOLD(*s);
4372                             s++;
4373                         }
4374                         else {
4375                             STRLEN len;
4376                             to_utf8_fold(s, d, &len);
4377                             d += len;
4378                             s += UTF8SKIP(s);
4379                         }
4380                     }
4381
4382                     /* And set up so the code below that looks in this folded
4383                      * buffer instead of the node's string */
4384                     e = d;
4385                     foldlen = UTF8SKIP(folded);
4386                     s = folded;
4387                 }
4388
4389                 /* When we reach here 's' points to the fold of the first
4390                  * character(s) of the node; and 'e' points to far enough along
4391                  * the folded string to be just past any possible multi-char
4392                  * fold. 'foldlen' is the length in bytes of the first
4393                  * character in 's'
4394                  *
4395                  * Unlike the non-UTF-8 case, the macro for determining if a
4396                  * string is a multi-char fold requires all the characters to
4397                  * already be folded.  This is because of all the complications
4398                  * if not.  Note that they are folded anyway, except in EXACTFL
4399                  * nodes.  Like the non-UTF case above, we punt if the node
4400                  * begins with a multi-char fold  */
4401
4402                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4403                     EXACTF_invlist =
4404                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4405                 }
4406                 else {  /* Single char fold */
4407
4408                     /* It matches all the things that fold to it, which are
4409                      * found in PL_utf8_foldclosures (including itself) */
4410                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4411                     if (! PL_utf8_foldclosures) {
4412                         _load_PL_utf8_foldclosures();
4413                     }
4414                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4415                                         (char *) s, foldlen, FALSE)))
4416                     {
4417                         AV* list = (AV*) *listp;
4418                         IV k;
4419                         for (k = 0; k <= av_tindex(list); k++) {
4420                             SV** c_p = av_fetch(list, k, FALSE);
4421                             UV c;
4422                             assert(c_p);
4423
4424                             c = SvUV(*c_p);
4425
4426                             /* /aa doesn't allow folds between ASCII and non- */
4427                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4428                                 && isASCII(c) != isASCII(uc))
4429                             {
4430                                 continue;
4431                             }
4432
4433                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4434                         }
4435                     }
4436                 }
4437             }
4438             if (flags & SCF_DO_STCLASS_AND) {
4439                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4440                 ANYOF_POSIXL_ZERO(data->start_class);
4441                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4442             }
4443             else if (flags & SCF_DO_STCLASS_OR) {
4444                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4445                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4446
4447                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4448                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4449             }
4450             flags &= ~SCF_DO_STCLASS;
4451             SvREFCNT_dec(EXACTF_invlist);
4452         }
4453         else if (REGNODE_VARIES(OP(scan))) {
4454             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4455             I32 fl = 0, f = flags;
4456             regnode * const oscan = scan;
4457             regnode_ssc this_class;
4458             regnode_ssc *oclass = NULL;
4459             I32 next_is_eval = 0;
4460
4461             switch (PL_regkind[OP(scan)]) {
4462             case WHILEM:                /* End of (?:...)* . */
4463                 scan = NEXTOPER(scan);
4464                 goto finish;
4465             case PLUS:
4466                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4467                     next = NEXTOPER(scan);
4468                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4469                         mincount = 1;
4470                         maxcount = REG_INFTY;
4471                         next = regnext(scan);
4472                         scan = NEXTOPER(scan);
4473                         goto do_curly;
4474                     }
4475                 }
4476                 if (flags & SCF_DO_SUBSTR)
4477                     data->pos_min++;
4478                 min++;
4479                 /* FALLTHROUGH */
4480             case STAR:
4481                 if (flags & SCF_DO_STCLASS) {
4482                     mincount = 0;
4483                     maxcount = REG_INFTY;
4484                     next = regnext(scan);
4485                     scan = NEXTOPER(scan);
4486                     goto do_curly;
4487                 }
4488                 if (flags & SCF_DO_SUBSTR) {
4489                     scan_commit(pRExC_state, data, minlenp, is_inf);
4490                     /* Cannot extend fixed substrings */
4491                     data->longest = &(data->longest_float);
4492                 }
4493                 is_inf = is_inf_internal = 1;
4494                 scan = regnext(scan);
4495                 goto optimize_curly_tail;
4496             case CURLY:
4497                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4498                     && (scan->flags == stopparen))
4499                 {
4500                     mincount = 1;
4501                     maxcount = 1;
4502                 } else {
4503                     mincount = ARG1(scan);
4504                     maxcount = ARG2(scan);
4505                 }
4506                 next = regnext(scan);
4507                 if (OP(scan) == CURLYX) {
4508                     I32 lp = (data ? *(data->last_closep) : 0);
4509                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4510                 }
4511                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4512                 next_is_eval = (OP(scan) == EVAL);
4513               do_curly:
4514                 if (flags & SCF_DO_SUBSTR) {
4515                     if (mincount == 0)
4516                         scan_commit(pRExC_state, data, minlenp, is_inf);
4517                     /* Cannot extend fixed substrings */
4518                     pos_before = data->pos_min;
4519                 }
4520                 if (data) {
4521                     fl = data->flags;
4522                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4523                     if (is_inf)
4524                         data->flags |= SF_IS_INF;
4525                 }
4526                 if (flags & SCF_DO_STCLASS) {
4527                     ssc_init(pRExC_state, &this_class);
4528                     oclass = data->start_class;
4529                     data->start_class = &this_class;
4530                     f |= SCF_DO_STCLASS_AND;
4531                     f &= ~SCF_DO_STCLASS_OR;
4532                 }
4533                 /* Exclude from super-linear cache processing any {n,m}
4534                    regops for which the combination of input pos and regex
4535                    pos is not enough information to determine if a match
4536                    will be possible.
4537
4538                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4539                    regex pos at the \s*, the prospects for a match depend not
4540                    only on the input position but also on how many (bar\s*)
4541                    repeats into the {4,8} we are. */
4542                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4543                     f &= ~SCF_WHILEM_VISITED_POS;
4544
4545                 /* This will finish on WHILEM, setting scan, or on NULL: */
4546                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4547                                   last, data, stopparen, recursed_depth, NULL,
4548                                   (mincount == 0
4549                                    ? (f & ~SCF_DO_SUBSTR)
4550                                    : f)
4551                                   ,depth+1);
4552
4553                 if (flags & SCF_DO_STCLASS)
4554                     data->start_class = oclass;
4555                 if (mincount == 0 || minnext == 0) {
4556                     if (flags & SCF_DO_STCLASS_OR) {
4557                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4558                     }
4559                     else if (flags & SCF_DO_STCLASS_AND) {
4560                         /* Switch to OR mode: cache the old value of
4561                          * data->start_class */
4562                         INIT_AND_WITHP;
4563                         StructCopy(data->start_class, and_withp, regnode_ssc);
4564                         flags &= ~SCF_DO_STCLASS_AND;
4565                         StructCopy(&this_class, data->start_class, regnode_ssc);
4566                         flags |= SCF_DO_STCLASS_OR;
4567                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4568                     }
4569                 } else {                /* Non-zero len */
4570                     if (flags & SCF_DO_STCLASS_OR) {
4571                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4572                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4573                     }
4574                     else if (flags & SCF_DO_STCLASS_AND)
4575                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4576                     flags &= ~SCF_DO_STCLASS;
4577                 }
4578                 if (!scan)              /* It was not CURLYX, but CURLY. */
4579                     scan = next;
4580                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4581                     /* ? quantifier ok, except for (?{ ... }) */
4582                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4583                     && (minnext == 0) && (deltanext == 0)
4584                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4585                     && maxcount <= REG_INFTY/3) /* Complement check for big
4586                                                    count */
4587                 {
4588                     /* Fatal warnings may leak the regexp without this: */
4589                     SAVEFREESV(RExC_rx_sv);
4590                     ckWARNreg(RExC_parse,
4591                             "Quantifier unexpected on zero-length expression");
4592                     (void)ReREFCNT_inc(RExC_rx_sv);
4593                 }
4594
4595                 min += minnext * mincount;
4596                 is_inf_internal |= deltanext == SSize_t_MAX
4597                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4598                 is_inf |= is_inf_internal;
4599                 if (is_inf) {
4600                     delta = SSize_t_MAX;
4601                 } else {
4602                     delta += (minnext + deltanext) * maxcount
4603                              - minnext * mincount;
4604                 }
4605                 /* Try powerful optimization CURLYX => CURLYN. */
4606                 if (  OP(oscan) == CURLYX && data
4607                       && data->flags & SF_IN_PAR
4608                       && !(data->flags & SF_HAS_EVAL)
4609                       && !deltanext && minnext == 1 ) {
4610                     /* Try to optimize to CURLYN.  */
4611                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4612                     regnode * const nxt1 = nxt;
4613 #ifdef DEBUGGING
4614                     regnode *nxt2;
4615 #endif
4616
4617                     /* Skip open. */
4618                     nxt = regnext(nxt);
4619                     if (!REGNODE_SIMPLE(OP(nxt))
4620                         && !(PL_regkind[OP(nxt)] == EXACT
4621                              && STR_LEN(nxt) == 1))
4622                         goto nogo;
4623 #ifdef DEBUGGING
4624                     nxt2 = nxt;
4625 #endif
4626                     nxt = regnext(nxt);
4627                     if (OP(nxt) != CLOSE)
4628                         goto nogo;
4629                     if (RExC_open_parens) {
4630                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4631                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4632                     }
4633                     /* Now we know that nxt2 is the only contents: */
4634                     oscan->flags = (U8)ARG(nxt);
4635                     OP(oscan) = CURLYN;
4636                     OP(nxt1) = NOTHING; /* was OPEN. */
4637
4638 #ifdef DEBUGGING
4639                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4640                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4641                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4642                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4643                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4644                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4645 #endif
4646                 }
4647               nogo:
4648
4649                 /* Try optimization CURLYX => CURLYM. */
4650                 if (  OP(oscan) == CURLYX && data
4651                       && !(data->flags & SF_HAS_PAR)
4652                       && !(data->flags & SF_HAS_EVAL)
4653                       && !deltanext     /* atom is fixed width */
4654                       && minnext != 0   /* CURLYM can't handle zero width */
4655
4656                          /* Nor characters whose fold at run-time may be
4657                           * multi-character */
4658                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4659                 ) {
4660                     /* XXXX How to optimize if data == 0? */
4661                     /* Optimize to a simpler form.  */
4662                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4663                     regnode *nxt2;
4664
4665                     OP(oscan) = CURLYM;
4666                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4667                             && (OP(nxt2) != WHILEM))
4668                         nxt = nxt2;
4669                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4670                     /* Need to optimize away parenths. */
4671                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4672                         /* Set the parenth number.  */
4673                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4674
4675                         oscan->flags = (U8)ARG(nxt);
4676                         if (RExC_open_parens) {
4677                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4678                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4679                         }
4680                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4681                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4682
4683 #ifdef DEBUGGING
4684                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4685                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4686                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4687                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4688 #endif
4689 #if 0
4690                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4691                             regnode *nnxt = regnext(nxt1);
4692                             if (nnxt == nxt) {
4693                                 if (reg_off_by_arg[OP(nxt1)])
4694                                     ARG_SET(nxt1, nxt2 - nxt1);
4695                                 else if (nxt2 - nxt1 < U16_MAX)
4696                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4697                                 else
4698                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4699                             }
4700                             nxt1 = nnxt;
4701                         }
4702 #endif
4703                         /* Optimize again: */
4704                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4705                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4706                     }
4707                     else
4708                         oscan->flags = 0;
4709                 }
4710                 else if ((OP(oscan) == CURLYX)
4711                          && (flags & SCF_WHILEM_VISITED_POS)
4712                          /* See the comment on a similar expression above.
4713                             However, this time it's not a subexpression
4714                             we care about, but the expression itself. */
4715                          && (maxcount == REG_INFTY)
4716                          && data && ++data->whilem_c < 16) {
4717                     /* This stays as CURLYX, we can put the count/of pair. */
4718                     /* Find WHILEM (as in regexec.c) */
4719                     regnode *nxt = oscan + NEXT_OFF(oscan);
4720
4721                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4722                         nxt += ARG(nxt);
4723                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4724                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4725                 }
4726                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4727                     pars++;
4728                 if (flags & SCF_DO_SUBSTR) {
4729                     SV *last_str = NULL;
4730                     STRLEN last_chrs = 0;
4731                     int counted = mincount != 0;
4732
4733                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4734                                                                   string. */
4735                         SSize_t b = pos_before >= data->last_start_min
4736                             ? pos_before : data->last_start_min;
4737                         STRLEN l;
4738                         const char * const s = SvPV_const(data->last_found, l);
4739                         SSize_t old = b - data->last_start_min;
4740
4741                         if (UTF)
4742                             old = utf8_hop((U8*)s, old) - (U8*)s;
4743                         l -= old;
4744                         /* Get the added string: */
4745                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4746                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4747                                             (U8*)(s + old + l)) : l;
4748                         if (deltanext == 0 && pos_before == b) {
4749                             /* What was added is a constant string */
4750                             if (mincount > 1) {
4751
4752                                 SvGROW(last_str, (mincount * l) + 1);
4753                                 repeatcpy(SvPVX(last_str) + l,
4754                                           SvPVX_const(last_str), l,
4755                                           mincount - 1);
4756                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4757                                 /* Add additional parts. */
4758                                 SvCUR_set(data->last_found,
4759                                           SvCUR(data->last_found) - l);
4760                                 sv_catsv(data->last_found, last_str);
4761                                 {
4762                                     SV * sv = data->last_found;
4763                                     MAGIC *mg =
4764                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4765                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4766                                     if (mg && mg->mg_len >= 0)
4767                                         mg->mg_len += last_chrs * (mincount-1);
4768                                 }
4769                                 last_chrs *= mincount;
4770                                 data->last_end += l * (mincount - 1);
4771                             }
4772                         } else {
4773                             /* start offset must point into the last copy */
4774                             data->last_start_min += minnext * (mincount - 1);
4775                             data->last_start_max += is_inf ? SSize_t_MAX
4776                                 : (maxcount - 1) * (minnext + data->pos_delta);
4777                         }
4778                     }
4779                     /* It is counted once already... */
4780                     data->pos_min += minnext * (mincount - counted);
4781 #if 0
4782 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4783                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4784                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4785     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4786     (UV)mincount);
4787 if (deltanext != SSize_t_MAX)
4788 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4789     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4790           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4791 #endif
4792                     if (deltanext == SSize_t_MAX
4793                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4794                         data->pos_delta = SSize_t_MAX;
4795                     else
4796                         data->pos_delta += - counted * deltanext +
4797                         (minnext + deltanext) * maxcount - minnext * mincount;
4798                     if (mincount != maxcount) {
4799                          /* Cannot extend fixed substrings found inside
4800                             the group.  */
4801                         scan_commit(pRExC_state, data, minlenp, is_inf);
4802                         if (mincount && last_str) {
4803                             SV * const sv = data->last_found;
4804                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4805                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4806
4807                             if (mg)
4808                                 mg->mg_len = -1;
4809                             sv_setsv(sv, last_str);
4810                             data->last_end = data->pos_min;
4811                             data->last_start_min = data->pos_min - last_chrs;
4812                             data->last_start_max = is_inf
4813                                 ? SSize_t_MAX
4814                                 : data->pos_min + data->pos_delta - last_chrs;
4815                         }
4816                         data->longest = &(data->longest_float);
4817                     }
4818                     SvREFCNT_dec(last_str);
4819                 }
4820                 if (data && (fl & SF_HAS_EVAL))
4821                     data->flags |= SF_HAS_EVAL;
4822               optimize_curly_tail:
4823                 if (OP(oscan) != CURLYX) {
4824                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4825                            && NEXT_OFF(next))
4826                         NEXT_OFF(oscan) += NEXT_OFF(next);
4827                 }
4828                 continue;
4829
4830             default:
4831 #ifdef DEBUGGING
4832                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4833                                                                     OP(scan));
4834 #endif
4835             case REF:
4836             case CLUMP:
4837                 if (flags & SCF_DO_SUBSTR) {
4838                     /* Cannot expect anything... */
4839                     scan_commit(pRExC_state, data, minlenp, is_inf);
4840                     data->longest = &(data->longest_float);
4841                 }
4842                 is_inf = is_inf_internal = 1;
4843                 if (flags & SCF_DO_STCLASS_OR) {
4844                     if (OP(scan) == CLUMP) {
4845                         /* Actually is any start char, but very few code points
4846                          * aren't start characters */
4847                         ssc_match_all_cp(data->start_class);
4848                     }
4849                     else {
4850                         ssc_anything(data->start_class);
4851                     }
4852                 }
4853                 flags &= ~SCF_DO_STCLASS;
4854                 break;
4855             }
4856         }
4857         else if (OP(scan) == LNBREAK) {
4858             if (flags & SCF_DO_STCLASS) {
4859                 if (flags & SCF_DO_STCLASS_AND) {
4860                     ssc_intersection(data->start_class,
4861                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4862                     ssc_clear_locale(data->start_class);
4863                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4864                 }
4865                 else if (flags & SCF_DO_STCLASS_OR) {
4866                     ssc_union(data->start_class,
4867                               PL_XPosix_ptrs[_CC_VERTSPACE],
4868                               FALSE);
4869                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4870
4871                     /* See commit msg for
4872                      * 749e076fceedeb708a624933726e7989f2302f6a */
4873                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4874                 }
4875                 flags &= ~SCF_DO_STCLASS;
4876             }
4877             min++;
4878             delta++;    /* Because of the 2 char string cr-lf */
4879             if (flags & SCF_DO_SUBSTR) {
4880                 /* Cannot expect anything... */
4881                 scan_commit(pRExC_state, data, minlenp, is_inf);
4882                 data->pos_min += 1;
4883                 data->pos_delta += 1;
4884                 data->longest = &(data->longest_float);
4885             }
4886         }
4887         else if (REGNODE_SIMPLE(OP(scan))) {
4888
4889             if (flags & SCF_DO_SUBSTR) {
4890                 scan_commit(pRExC_state, data, minlenp, is_inf);
4891                 data->pos_min++;
4892             }
4893             min++;
4894             if (flags & SCF_DO_STCLASS) {
4895                 bool invert = 0;
4896                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4897                 U8 namedclass;
4898
4899                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4900                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4901
4902                 /* Some of the logic below assumes that switching
4903                    locale on will only add false positives. */
4904                 switch (OP(scan)) {
4905
4906                 default:
4907 #ifdef DEBUGGING
4908                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4909                                                                      OP(scan));
4910 #endif
4911                 case CANY:
4912                 case SANY:
4913                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4914                         ssc_match_all_cp(data->start_class);
4915                     break;
4916
4917                 case REG_ANY:
4918                     {
4919                         SV* REG_ANY_invlist = _new_invlist(2);
4920                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4921                                                             '\n');
4922                         if (flags & SCF_DO_STCLASS_OR) {
4923                             ssc_union(data->start_class,
4924                                       REG_ANY_invlist,
4925                                       TRUE /* TRUE => invert, hence all but \n
4926                                             */
4927                                       );
4928                         }
4929                         else if (flags & SCF_DO_STCLASS_AND) {
4930                             ssc_intersection(data->start_class,
4931                                              REG_ANY_invlist,
4932                                              TRUE  /* TRUE => invert */
4933                                              );
4934                             ssc_clear_locale(data->start_class);
4935                         }
4936                         SvREFCNT_dec_NN(REG_ANY_invlist);
4937                     }
4938                     break;
4939
4940                 case ANYOF:
4941                     if (flags & SCF_DO_STCLASS_AND)
4942                         ssc_and(pRExC_state, data->start_class,
4943                                 (regnode_charclass *) scan);
4944                     else
4945                         ssc_or(pRExC_state, data->start_class,
4946                                                           (regnode_charclass *) scan);
4947                     break;
4948
4949                 case NPOSIXL:
4950                     invert = 1;
4951                     /* FALLTHROUGH */
4952
4953                 case POSIXL:
4954                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4955                     if (flags & SCF_DO_STCLASS_AND) {
4956                         bool was_there = cBOOL(
4957                                           ANYOF_POSIXL_TEST(data->start_class,
4958                                                                  namedclass));
4959                         ANYOF_POSIXL_ZERO(data->start_class);
4960                         if (was_there) {    /* Do an AND */
4961                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4962                         }
4963                         /* No individual code points can now match */
4964                         data->start_class->invlist
4965                                                 = sv_2mortal(_new_invlist(0));
4966                     }
4967                     else {
4968                         int complement = namedclass + ((invert) ? -1 : 1);
4969
4970                         assert(flags & SCF_DO_STCLASS_OR);
4971
4972                         /* If the complement of this class was already there,
4973                          * the result is that they match all code points,
4974                          * (\d + \D == everything).  Remove the classes from
4975                          * future consideration.  Locale is not relevant in
4976                          * this case */
4977                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4978                             ssc_match_all_cp(data->start_class);
4979                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4980                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4981                         }
4982                         else {  /* The usual case; just add this class to the
4983                                    existing set */
4984                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4985                         }
4986                     }
4987                     break;
4988
4989                 case NPOSIXA:   /* For these, we always know the exact set of
4990                                    what's matched */
4991                     invert = 1;
4992                     /* FALLTHROUGH */
4993                 case POSIXA:
4994                     if (FLAGS(scan) == _CC_ASCII) {
4995                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4996                     }
4997                     else {
4998                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4999                                               PL_XPosix_ptrs[_CC_ASCII],
5000                                               &my_invlist);
5001                     }
5002                     goto join_posix;
5003
5004                 case NPOSIXD:
5005                 case NPOSIXU:
5006                     invert = 1;
5007                     /* FALLTHROUGH */
5008                 case POSIXD:
5009                 case POSIXU:
5010                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5011
5012                     /* NPOSIXD matches all upper Latin1 code points unless the
5013                      * target string being matched is UTF-8, which is
5014                      * unknowable until match time.  Since we are going to
5015                      * invert, we want to get rid of all of them so that the
5016                      * inversion will match all */
5017                     if (OP(scan) == NPOSIXD) {
5018                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5019                                           &my_invlist);
5020                     }
5021
5022                   join_posix:
5023
5024                     if (flags & SCF_DO_STCLASS_AND) {
5025                         ssc_intersection(data->start_class, my_invlist, invert);
5026                         ssc_clear_locale(data->start_class);
5027                     }
5028                     else {
5029                         assert(flags & SCF_DO_STCLASS_OR);
5030                         ssc_union(data->start_class, my_invlist, invert);
5031                     }
5032                 }
5033                 if (flags & SCF_DO_STCLASS_OR)
5034                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5035                 flags &= ~SCF_DO_STCLASS;
5036             }
5037         }
5038         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5039             data->flags |= (OP(scan) == MEOL
5040                             ? SF_BEFORE_MEOL
5041                             : SF_BEFORE_SEOL);
5042             scan_commit(pRExC_state, data, minlenp, is_inf);
5043
5044         }
5045         else if (  PL_regkind[OP(scan)] == BRANCHJ
5046                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5047                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5048                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5049             if ( OP(scan) == UNLESSM &&
5050                  scan->flags == 0 &&
5051                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5052                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5053             ) {
5054                 regnode *opt;
5055                 regnode *upto= regnext(scan);
5056                 DEBUG_PARSE_r({
5057                     SV * const mysv_val=sv_newmortal();
5058                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5059
5060                     /*DEBUG_PARSE_MSG("opfail");*/
5061                     regprop(RExC_rx, mysv_val, upto, NULL);
5062                     PerlIO_printf(Perl_debug_log,
5063                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5064                         SvPV_nolen_const(mysv_val),
5065                         (IV)REG_NODE_NUM(upto),
5066                         (IV)(upto - scan)
5067                     );
5068                 });
5069                 OP(scan) = OPFAIL;
5070                 NEXT_OFF(scan) = upto - scan;
5071                 for (opt= scan + 1; opt < upto ; opt++)
5072                     OP(opt) = OPTIMIZED;
5073                 scan= upto;
5074                 continue;
5075             }
5076             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5077                 || OP(scan) == UNLESSM )
5078             {
5079                 /* Negative Lookahead/lookbehind
5080                    In this case we can't do fixed string optimisation.
5081                 */
5082
5083                 SSize_t deltanext, minnext, fake = 0;
5084                 regnode *nscan;
5085                 regnode_ssc intrnl;
5086                 int f = 0;
5087
5088                 data_fake.flags = 0;
5089                 if (data) {
5090                     data_fake.whilem_c = data->whilem_c;
5091                     data_fake.last_closep = data->last_closep;
5092                 }
5093                 else
5094                     data_fake.last_closep = &fake;
5095                 data_fake.pos_delta = delta;
5096                 if ( flags & SCF_DO_STCLASS && !scan->flags
5097                      && OP(scan) == IFMATCH ) { /* Lookahead */
5098                     ssc_init(pRExC_state, &intrnl);
5099                     data_fake.start_class = &intrnl;
5100                     f |= SCF_DO_STCLASS_AND;
5101                 }
5102                 if (flags & SCF_WHILEM_VISITED_POS)
5103                     f |= SCF_WHILEM_VISITED_POS;
5104                 next = regnext(scan);
5105                 nscan = NEXTOPER(NEXTOPER(scan));
5106                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5107                                       last, &data_fake, stopparen,
5108                                       recursed_depth, NULL, f, depth+1);
5109                 if (scan->flags) {
5110                     if (deltanext) {
5111                         FAIL("Variable length lookbehind not implemented");
5112                     }
5113                     else if (minnext > (I32)U8_MAX) {
5114                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5115                               (UV)U8_MAX);
5116                     }
5117                     scan->flags = (U8)minnext;
5118                 }
5119                 if (data) {
5120                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5121                         pars++;
5122                     if (data_fake.flags & SF_HAS_EVAL)
5123                         data->flags |= SF_HAS_EVAL;
5124                     data->whilem_c = data_fake.whilem_c;
5125                 }
5126                 if (f & SCF_DO_STCLASS_AND) {
5127                     if (flags & SCF_DO_STCLASS_OR) {
5128                         /* OR before, AND after: ideally we would recurse with
5129                          * data_fake to get the AND applied by study of the
5130                          * remainder of the pattern, and then derecurse;
5131                          * *** HACK *** for now just treat as "no information".
5132                          * See [perl #56690].
5133                          */
5134                         ssc_init(pRExC_state, data->start_class);
5135                     }  else {
5136                         /* AND before and after: combine and continue */
5137                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5138                     }
5139                 }
5140             }
5141 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5142             else {
5143                 /* Positive Lookahead/lookbehind
5144                    In this case we can do fixed string optimisation,
5145                    but we must be careful about it. Note in the case of
5146                    lookbehind the positions will be offset by the minimum
5147                    length of the pattern, something we won't know about
5148                    until after the recurse.
5149                 */
5150                 SSize_t deltanext, fake = 0;
5151                 regnode *nscan;
5152                 regnode_ssc intrnl;
5153                 int f = 0;
5154                 /* We use SAVEFREEPV so that when the full compile
5155                     is finished perl will clean up the allocated
5156                     minlens when it's all done. This way we don't
5157                     have to worry about freeing them when we know
5158                     they wont be used, which would be a pain.
5159                  */
5160                 SSize_t *minnextp;
5161                 Newx( minnextp, 1, SSize_t );
5162                 SAVEFREEPV(minnextp);
5163
5164                 if (data) {
5165                     StructCopy(data, &data_fake, scan_data_t);
5166                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5167                         f |= SCF_DO_SUBSTR;
5168                         if (scan->flags)
5169                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5170                         data_fake.last_found=newSVsv(data->last_found);
5171                     }
5172                 }
5173                 else
5174                     data_fake.last_closep = &fake;
5175                 data_fake.flags = 0;
5176                 data_fake.pos_delta = delta;
5177                 if (is_inf)
5178                     data_fake.flags |= SF_IS_INF;
5179                 if ( flags & SCF_DO_STCLASS && !scan->flags
5180                      && OP(scan) == IFMATCH ) { /* Lookahead */
5181                     ssc_init(pRExC_state, &intrnl);
5182                     data_fake.start_class = &intrnl;
5183                     f |= SCF_DO_STCLASS_AND;
5184                 }
5185                 if (flags & SCF_WHILEM_VISITED_POS)
5186                     f |= SCF_WHILEM_VISITED_POS;
5187                 next = regnext(scan);
5188                 nscan = NEXTOPER(NEXTOPER(scan));
5189
5190                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5191                                         &deltanext, last, &data_fake,
5192                                         stopparen, recursed_depth, NULL,
5193                                         f,depth+1);
5194                 if (scan->flags) {
5195                     if (deltanext) {
5196                         FAIL("Variable length lookbehind not implemented");
5197                     }
5198                     else if (*minnextp > (I32)U8_MAX) {
5199                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5200                               (UV)U8_MAX);
5201                     }
5202                     scan->flags = (U8)*minnextp;
5203                 }
5204
5205                 *minnextp += min;
5206
5207                 if (f & SCF_DO_STCLASS_AND) {
5208                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5209                 }
5210                 if (data) {
5211                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5212                         pars++;
5213                     if (data_fake.flags & SF_HAS_EVAL)
5214                         data->flags |= SF_HAS_EVAL;
5215                     data->whilem_c = data_fake.whilem_c;
5216                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5217                         if (RExC_rx->minlen<*minnextp)
5218                             RExC_rx->minlen=*minnextp;
5219                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5220                         SvREFCNT_dec_NN(data_fake.last_found);
5221
5222                         if ( data_fake.minlen_fixed != minlenp )
5223                         {
5224                             data->offset_fixed= data_fake.offset_fixed;
5225                             data->minlen_fixed= data_fake.minlen_fixed;
5226                             data->lookbehind_fixed+= scan->flags;
5227                         }
5228                         if ( data_fake.minlen_float != minlenp )
5229                         {
5230                             data->minlen_float= data_fake.minlen_float;
5231                             data->offset_float_min=data_fake.offset_float_min;
5232                             data->offset_float_max=data_fake.offset_float_max;
5233                             data->lookbehind_float+= scan->flags;
5234                         }
5235                     }
5236                 }
5237             }
5238 #endif
5239         }
5240         else if (OP(scan) == OPEN) {
5241             if (stopparen != (I32)ARG(scan))
5242                 pars++;
5243         }
5244         else if (OP(scan) == CLOSE) {
5245             if (stopparen == (I32)ARG(scan)) {
5246                 break;
5247             }
5248             if ((I32)ARG(scan) == is_par) {
5249                 next = regnext(scan);
5250
5251                 if ( next && (OP(next) != WHILEM) && next < last)
5252                     is_par = 0;         /* Disable optimization */
5253             }
5254             if (data)
5255                 *(data->last_closep) = ARG(scan);
5256         }
5257         else if (OP(scan) == EVAL) {
5258                 if (data)
5259                     data->flags |= SF_HAS_EVAL;
5260         }
5261         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5262             if (flags & SCF_DO_SUBSTR) {
5263                 scan_commit(pRExC_state, data, minlenp, is_inf);
5264                 flags &= ~SCF_DO_SUBSTR;
5265             }
5266             if (data && OP(scan)==ACCEPT) {
5267                 data->flags |= SCF_SEEN_ACCEPT;
5268                 if (stopmin > min)
5269                     stopmin = min;
5270             }
5271         }
5272         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5273         {
5274                 if (flags & SCF_DO_SUBSTR) {
5275                     scan_commit(pRExC_state, data, minlenp, is_inf);
5276                     data->longest = &(data->longest_float);
5277                 }
5278                 is_inf = is_inf_internal = 1;
5279                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5280                     ssc_anything(data->start_class);
5281                 flags &= ~SCF_DO_STCLASS;
5282         }
5283         else if (OP(scan) == GPOS) {
5284             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5285                 !(delta || is_inf || (data && data->pos_delta)))
5286             {
5287                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5288                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5289                 if (RExC_rx->gofs < (STRLEN)min)
5290                     RExC_rx->gofs = min;
5291             } else {
5292                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5293                 RExC_rx->gofs = 0;
5294             }
5295         }
5296 #ifdef TRIE_STUDY_OPT
5297 #ifdef FULL_TRIE_STUDY
5298         else if (PL_regkind[OP(scan)] == TRIE) {
5299             /* NOTE - There is similar code to this block above for handling
5300                BRANCH nodes on the initial study.  If you change stuff here
5301                check there too. */
5302             regnode *trie_node= scan;
5303             regnode *tail= regnext(scan);
5304             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5305             SSize_t max1 = 0, min1 = SSize_t_MAX;
5306             regnode_ssc accum;
5307
5308             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5309                 /* Cannot merge strings after this. */
5310                 scan_commit(pRExC_state, data, minlenp, is_inf);
5311             }
5312             if (flags & SCF_DO_STCLASS)
5313                 ssc_init_zero(pRExC_state, &accum);
5314
5315             if (!trie->jump) {
5316                 min1= trie->minlen;
5317                 max1= trie->maxlen;
5318             } else {
5319                 const regnode *nextbranch= NULL;
5320                 U32 word;
5321
5322                 for ( word=1 ; word <= trie->wordcount ; word++)
5323                 {
5324                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5325                     regnode_ssc this_class;
5326
5327                     data_fake.flags = 0;
5328                     if (data) {
5329                         data_fake.whilem_c = data->whilem_c;
5330                         data_fake.last_closep = data->last_closep;
5331                     }
5332                     else
5333                         data_fake.last_closep = &fake;
5334                     data_fake.pos_delta = delta;
5335                     if (flags & SCF_DO_STCLASS) {
5336                         ssc_init(pRExC_state, &this_class);
5337                         data_fake.start_class = &this_class;
5338                         f = SCF_DO_STCLASS_AND;
5339                     }
5340                     if (flags & SCF_WHILEM_VISITED_POS)
5341                         f |= SCF_WHILEM_VISITED_POS;
5342
5343                     if (trie->jump[word]) {
5344                         if (!nextbranch)
5345                             nextbranch = trie_node + trie->jump[0];
5346                         scan= trie_node + trie->jump[word];
5347                         /* We go from the jump point to the branch that follows
5348                            it. Note this means we need the vestigal unused
5349                            branches even though they arent otherwise used. */
5350                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5351                             &deltanext, (regnode *)nextbranch, &data_fake,
5352                             stopparen, recursed_depth, NULL, f,depth+1);
5353                     }
5354                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5355                         nextbranch= regnext((regnode*)nextbranch);
5356
5357                     if (min1 > (SSize_t)(minnext + trie->minlen))
5358                         min1 = minnext + trie->minlen;
5359                     if (deltanext == SSize_t_MAX) {
5360                         is_inf = is_inf_internal = 1;
5361                         max1 = SSize_t_MAX;
5362                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5363                         max1 = minnext + deltanext + trie->maxlen;
5364
5365                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5366                         pars++;
5367                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5368                         if ( stopmin > min + min1)
5369                             stopmin = min + min1;
5370                         flags &= ~SCF_DO_SUBSTR;
5371                         if (data)
5372                             data->flags |= SCF_SEEN_ACCEPT;
5373                     }
5374                     if (data) {
5375                         if (data_fake.flags & SF_HAS_EVAL)
5376                             data->flags |= SF_HAS_EVAL;
5377                         data->whilem_c = data_fake.whilem_c;
5378                     }
5379                     if (flags & SCF_DO_STCLASS)
5380                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5381                 }
5382             }
5383             if (flags & SCF_DO_SUBSTR) {
5384                 data->pos_min += min1;
5385                 data->pos_delta += max1 - min1;
5386                 if (max1 != min1 || is_inf)
5387                     data->longest = &(data->longest_float);
5388             }
5389             min += min1;
5390             delta += max1 - min1;
5391             if (flags & SCF_DO_STCLASS_OR) {
5392                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5393                 if (min1) {
5394                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5395                     flags &= ~SCF_DO_STCLASS;
5396                 }
5397             }
5398             else if (flags & SCF_DO_STCLASS_AND) {
5399                 if (min1) {
5400                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5401                     flags &= ~SCF_DO_STCLASS;
5402                 }
5403                 else {
5404                     /* Switch to OR mode: cache the old value of
5405                      * data->start_class */
5406                     INIT_AND_WITHP;
5407                     StructCopy(data->start_class, and_withp, regnode_ssc);
5408                     flags &= ~SCF_DO_STCLASS_AND;
5409                     StructCopy(&accum, data->start_class, regnode_ssc);
5410                     flags |= SCF_DO_STCLASS_OR;
5411                 }
5412             }
5413             scan= tail;
5414             continue;
5415         }
5416 #else
5417         else if (PL_regkind[OP(scan)] == TRIE) {
5418             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5419             U8*bang=NULL;
5420
5421             min += trie->minlen;
5422             delta += (trie->maxlen - trie->minlen);
5423             flags &= ~SCF_DO_STCLASS; /* xxx */
5424             if (flags & SCF_DO_SUBSTR) {
5425                 /* Cannot expect anything... */
5426                 scan_commit(pRExC_state, data, minlenp, is_inf);
5427                 data->pos_min += trie->minlen;
5428                 data->pos_delta += (trie->maxlen - trie->minlen);
5429                 if (trie->maxlen != trie->minlen)
5430                     data->longest = &(data->longest_float);
5431             }
5432             if (trie->jump) /* no more substrings -- for now /grr*/
5433                flags &= ~SCF_DO_SUBSTR;
5434         }
5435 #endif /* old or new */
5436 #endif /* TRIE_STUDY_OPT */
5437
5438         /* Else: zero-length, ignore. */
5439         scan = regnext(scan);
5440     }
5441     /* If we are exiting a recursion we can unset its recursed bit
5442      * and allow ourselves to enter it again - no danger of an
5443      * infinite loop there.
5444     if (stopparen > -1 && recursed) {
5445         DEBUG_STUDYDATA("unset:", data,depth);
5446         PAREN_UNSET( recursed, stopparen);
5447     }
5448     */
5449     if (frame) {
5450         DEBUG_STUDYDATA("frame-end:",data,depth);
5451         DEBUG_PEEP("fend", scan, depth);
5452         /* restore previous context */
5453         last = frame->last;
5454         scan = frame->next;
5455         stopparen = frame->stop;
5456         recursed_depth = frame->prev_recursed_depth;
5457         depth = depth - 1;
5458
5459         frame = frame->prev;
5460         goto fake_study_recurse;
5461     }
5462
5463   finish:
5464     assert(!frame);
5465     DEBUG_STUDYDATA("pre-fin:",data,depth);
5466
5467     *scanp = scan;
5468     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5469
5470     if (flags & SCF_DO_SUBSTR && is_inf)
5471         data->pos_delta = SSize_t_MAX - data->pos_min;
5472     if (is_par > (I32)U8_MAX)
5473         is_par = 0;
5474     if (is_par && pars==1 && data) {
5475         data->flags |= SF_IN_PAR;
5476         data->flags &= ~SF_HAS_PAR;
5477     }
5478     else if (pars && data) {
5479         data->flags |= SF_HAS_PAR;
5480         data->flags &= ~SF_IN_PAR;
5481     }
5482     if (flags & SCF_DO_STCLASS_OR)
5483         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5484     if (flags & SCF_TRIE_RESTUDY)
5485         data->flags |=  SCF_TRIE_RESTUDY;
5486
5487     DEBUG_STUDYDATA("post-fin:",data,depth);
5488
5489     {
5490         SSize_t final_minlen= min < stopmin ? min : stopmin;
5491
5492         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5493             RExC_maxlen = final_minlen + delta;
5494         }
5495         return final_minlen;
5496     }
5497     /* not-reached */
5498 }
5499
5500 STATIC U32
5501 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5502 {
5503     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5504
5505     PERL_ARGS_ASSERT_ADD_DATA;
5506
5507     Renewc(RExC_rxi->data,
5508            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5509            char, struct reg_data);
5510     if(count)
5511         Renew(RExC_rxi->data->what, count + n, U8);
5512     else
5513         Newx(RExC_rxi->data->what, n, U8);
5514     RExC_rxi->data->count = count + n;
5515     Copy(s, RExC_rxi->data->what + count, n, U8);
5516     return count;
5517 }
5518
5519 /*XXX: todo make this not included in a non debugging perl, but appears to be
5520  * used anyway there, in 'use re' */
5521 #ifndef PERL_IN_XSUB_RE
5522 void
5523 Perl_reginitcolors(pTHX)
5524 {
5525     dVAR;
5526     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5527     if (s) {
5528         char *t = savepv(s);
5529         int i = 0;
5530         PL_colors[0] = t;
5531         while (++i < 6) {
5532             t = strchr(t, '\t');
5533             if (t) {
5534                 *t = '\0';
5535                 PL_colors[i] = ++t;
5536             }
5537             else
5538                 PL_colors[i] = t = (char *)"";
5539         }
5540     } else {
5541         int i = 0;
5542         while (i < 6)
5543             PL_colors[i++] = (char *)"";
5544     }
5545     PL_colorset = 1;
5546 }
5547 #endif
5548
5549
5550 #ifdef TRIE_STUDY_OPT
5551 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5552     STMT_START {                                            \
5553         if (                                                \
5554               (data.flags & SCF_TRIE_RESTUDY)               \
5555               && ! restudied++                              \
5556         ) {                                                 \
5557             dOsomething;                                    \
5558             goto reStudy;                                   \
5559         }                                                   \
5560     } STMT_END
5561 #else
5562 #define CHECK_RESTUDY_GOTO_butfirst
5563 #endif
5564
5565 /*
5566  * pregcomp - compile a regular expression into internal code
5567  *
5568  * Decides which engine's compiler to call based on the hint currently in
5569  * scope
5570  */
5571
5572 #ifndef PERL_IN_XSUB_RE
5573
5574 /* return the currently in-scope regex engine (or the default if none)  */
5575
5576 regexp_engine const *
5577 Perl_current_re_engine(pTHX)
5578 {
5579     dVAR;
5580
5581     if (IN_PERL_COMPILETIME) {
5582         HV * const table = GvHV(PL_hintgv);
5583         SV **ptr;
5584
5585         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5586             return &PL_core_reg_engine;
5587         ptr = hv_fetchs(table, "regcomp", FALSE);
5588         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5589             return &PL_core_reg_engine;
5590         return INT2PTR(regexp_engine*,SvIV(*ptr));
5591     }
5592     else {
5593         SV *ptr;
5594         if (!PL_curcop->cop_hints_hash)
5595             return &PL_core_reg_engine;
5596         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5597         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5598             return &PL_core_reg_engine;
5599         return INT2PTR(regexp_engine*,SvIV(ptr));
5600     }
5601 }
5602
5603
5604 REGEXP *
5605 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5606 {
5607     dVAR;
5608     regexp_engine const *eng = current_re_engine();
5609     GET_RE_DEBUG_FLAGS_DECL;
5610
5611     PERL_ARGS_ASSERT_PREGCOMP;
5612
5613     /* Dispatch a request to compile a regexp to correct regexp engine. */
5614     DEBUG_COMPILE_r({
5615         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5616                         PTR2UV(eng));
5617     });
5618     return CALLREGCOMP_ENG(eng, pattern, flags);
5619 }
5620 #endif
5621
5622 /* public(ish) entry point for the perl core's own regex compiling code.
5623  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5624  * pattern rather than a list of OPs, and uses the internal engine rather
5625  * than the current one */
5626
5627 REGEXP *
5628 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5629 {
5630     SV *pat = pattern; /* defeat constness! */
5631     PERL_ARGS_ASSERT_RE_COMPILE;
5632     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5633 #ifdef PERL_IN_XSUB_RE
5634                                 &my_reg_engine,
5635 #else
5636                                 &PL_core_reg_engine,
5637 #endif
5638                                 NULL, NULL, rx_flags, 0);
5639 }
5640
5641
5642 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5643  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5644  * point to the realloced string and length.
5645  *
5646  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5647  * stuff added */
5648
5649 static void
5650 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5651                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5652 {
5653     U8 *const src = (U8*)*pat_p;
5654     U8 *dst;
5655     int n=0;
5656     STRLEN s = 0, d = 0;
5657     bool do_end = 0;
5658     GET_RE_DEBUG_FLAGS_DECL;
5659
5660     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5661         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5662
5663     Newx(dst, *plen_p * 2 + 1, U8);
5664
5665     while (s < *plen_p) {
5666         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5667             dst[d]   = src[s];
5668         else {
5669             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5670             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5671         }
5672         if (n < num_code_blocks) {
5673             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5674                 pRExC_state->code_blocks[n].start = d;
5675                 assert(dst[d] == '(');
5676                 do_end = 1;
5677             }
5678             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5679                 pRExC_state->code_blocks[n].end = d;
5680                 assert(dst[d] == ')');
5681                 do_end = 0;
5682                 n++;
5683             }
5684         }
5685         s++;
5686         d++;
5687     }
5688     dst[d] = '\0';
5689     *plen_p = d;
5690     *pat_p = (char*) dst;
5691     SAVEFREEPV(*pat_p);
5692     RExC_orig_utf8 = RExC_utf8 = 1;
5693 }
5694
5695
5696
5697 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5698  * while recording any code block indices, and handling overloading,
5699  * nested qr// objects etc.  If pat is null, it will allocate a new
5700  * string, or just return the first arg, if there's only one.
5701  *
5702  * Returns the malloced/updated pat.
5703  * patternp and pat_count is the array of SVs to be concatted;
5704  * oplist is the optional list of ops that generated the SVs;
5705  * recompile_p is a pointer to a boolean that will be set if
5706  *   the regex will need to be recompiled.
5707  * delim, if non-null is an SV that will be inserted between each element
5708  */
5709
5710 static SV*
5711 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5712                 SV *pat, SV ** const patternp, int pat_count,
5713                 OP *oplist, bool *recompile_p, SV *delim)
5714 {
5715     SV **svp;
5716     int n = 0;
5717     bool use_delim = FALSE;
5718     bool alloced = FALSE;
5719
5720     /* if we know we have at least two args, create an empty string,
5721      * then concatenate args to that. For no args, return an empty string */
5722     if (!pat && pat_count != 1) {
5723         pat = newSVpvs("");
5724         SAVEFREESV(pat);
5725         alloced = TRUE;
5726     }
5727
5728     for (svp = patternp; svp < patternp + pat_count; svp++) {
5729         SV *sv;
5730         SV *rx  = NULL;
5731         STRLEN orig_patlen = 0;
5732         bool code = 0;
5733         SV *msv = use_delim ? delim : *svp;
5734         if (!msv) msv = &PL_sv_undef;
5735
5736         /* if we've got a delimiter, we go round the loop twice for each
5737          * svp slot (except the last), using the delimiter the second
5738          * time round */
5739         if (use_delim) {
5740             svp--;
5741             use_delim = FALSE;
5742         }
5743         else if (delim)
5744             use_delim = TRUE;
5745
5746         if (SvTYPE(msv) == SVt_PVAV) {
5747             /* we've encountered an interpolated array within
5748              * the pattern, e.g. /...@a..../. Expand the list of elements,
5749              * then recursively append elements.
5750              * The code in this block is based on S_pushav() */
5751
5752             AV *const av = (AV*)msv;
5753             const SSize_t maxarg = AvFILL(av) + 1;
5754             SV **array;
5755
5756             if (oplist) {
5757                 assert(oplist->op_type == OP_PADAV
5758                     || oplist->op_type == OP_RV2AV);
5759                 oplist = oplist->op_sibling;;
5760             }
5761
5762             if (SvRMAGICAL(av)) {
5763                 SSize_t i;
5764
5765                 Newx(array, maxarg, SV*);
5766                 SAVEFREEPV(array);
5767                 for (i=0; i < maxarg; i++) {
5768                     SV ** const svp = av_fetch(av, i, FALSE);
5769                     array[i] = svp ? *svp : &PL_sv_undef;
5770                 }
5771             }
5772             else
5773                 array = AvARRAY(av);
5774
5775             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5776                                 array, maxarg, NULL, recompile_p,
5777                                 /* $" */
5778                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5779
5780             continue;
5781         }
5782
5783
5784         /* we make the assumption here that each op in the list of
5785          * op_siblings maps to one SV pushed onto the stack,
5786          * except for code blocks, with have both an OP_NULL and
5787          * and OP_CONST.
5788          * This allows us to match up the list of SVs against the
5789          * list of OPs to find the next code block.
5790          *
5791          * Note that       PUSHMARK PADSV PADSV ..
5792          * is optimised to
5793          *                 PADRANGE PADSV  PADSV  ..
5794          * so the alignment still works. */
5795
5796         if (oplist) {
5797             if (oplist->op_type == OP_NULL
5798                 && (oplist->op_flags & OPf_SPECIAL))
5799             {
5800                 assert(n < pRExC_state->num_code_blocks);
5801                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5802                 pRExC_state->code_blocks[n].block = oplist;
5803                 pRExC_state->code_blocks[n].src_regex = NULL;
5804                 n++;
5805                 code = 1;
5806                 oplist = oplist->op_sibling; /* skip CONST */
5807                 assert(oplist);
5808             }
5809             oplist = oplist->op_sibling;;
5810         }
5811
5812         /* apply magic and QR overloading to arg */
5813
5814         SvGETMAGIC(msv);
5815         if (SvROK(msv) && SvAMAGIC(msv)) {
5816             SV *sv = AMG_CALLunary(msv, regexp_amg);
5817             if (sv) {
5818                 if (SvROK(sv))
5819                     sv = SvRV(sv);
5820                 if (SvTYPE(sv) != SVt_REGEXP)
5821                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5822                 msv = sv;
5823             }
5824         }
5825
5826         /* try concatenation overload ... */
5827         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5828                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5829         {
5830             sv_setsv(pat, sv);
5831             /* overloading involved: all bets are off over literal
5832              * code. Pretend we haven't seen it */
5833             pRExC_state->num_code_blocks -= n;
5834             n = 0;
5835         }
5836         else  {
5837             /* ... or failing that, try "" overload */
5838             while (SvAMAGIC(msv)
5839                     && (sv = AMG_CALLunary(msv, string_amg))
5840                     && sv != msv
5841                     &&  !(   SvROK(msv)
5842                           && SvROK(sv)
5843                           && SvRV(msv) == SvRV(sv))
5844             ) {
5845                 msv = sv;
5846                 SvGETMAGIC(msv);
5847             }
5848             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5849                 msv = SvRV(msv);
5850
5851             if (pat) {
5852                 /* this is a partially unrolled
5853                  *     sv_catsv_nomg(pat, msv);
5854                  * that allows us to adjust code block indices if
5855                  * needed */
5856                 STRLEN dlen;
5857                 char *dst = SvPV_force_nomg(pat, dlen);
5858                 orig_patlen = dlen;
5859                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5860                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5861                     sv_setpvn(pat, dst, dlen);
5862                     SvUTF8_on(pat);
5863                 }
5864                 sv_catsv_nomg(pat, msv);
5865                 rx = msv;
5866             }
5867             else
5868                 pat = msv;
5869
5870             if (code)
5871                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5872         }
5873
5874         /* extract any code blocks within any embedded qr//'s */
5875         if (rx && SvTYPE(rx) == SVt_REGEXP
5876             && RX_ENGINE((REGEXP*)rx)->op_comp)
5877         {
5878
5879             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5880             if (ri->num_code_blocks) {
5881                 int i;
5882                 /* the presence of an embedded qr// with code means
5883                  * we should always recompile: the text of the
5884                  * qr// may not have changed, but it may be a
5885                  * different closure than last time */
5886                 *recompile_p = 1;
5887                 Renew(pRExC_state->code_blocks,
5888                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5889                     struct reg_code_block);
5890                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5891
5892                 for (i=0; i < ri->num_code_blocks; i++) {
5893                     struct reg_code_block *src, *dst;
5894                     STRLEN offset =  orig_patlen
5895                         + ReANY((REGEXP *)rx)->pre_prefix;
5896                     assert(n < pRExC_state->num_code_blocks);
5897                     src = &ri->code_blocks[i];
5898                     dst = &pRExC_state->code_blocks[n];
5899                     dst->start      = src->start + offset;
5900                     dst->end        = src->end   + offset;
5901                     dst->block      = src->block;
5902                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5903                                             src->src_regex
5904                                                 ? src->src_regex
5905                                                 : (REGEXP*)rx);
5906                     n++;
5907                 }
5908             }
5909         }
5910     }
5911     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5912     if (alloced)
5913         SvSETMAGIC(pat);
5914
5915     return pat;
5916 }
5917
5918
5919
5920 /* see if there are any run-time code blocks in the pattern.
5921  * False positives are allowed */
5922
5923 static bool
5924 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5925                     char *pat, STRLEN plen)
5926 {
5927     int n = 0;
5928     STRLEN s;
5929
5930     PERL_UNUSED_CONTEXT;
5931
5932     for (s = 0; s < plen; s++) {
5933         if (n < pRExC_state->num_code_blocks
5934             && s == pRExC_state->code_blocks[n].start)
5935         {
5936             s = pRExC_state->code_blocks[n].end;
5937             n++;
5938             continue;
5939         }
5940         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5941          * positives here */
5942         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5943             (pat[s+2] == '{'
5944                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5945         )
5946             return 1;
5947     }
5948     return 0;
5949 }
5950
5951 /* Handle run-time code blocks. We will already have compiled any direct
5952  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5953  * copy of it, but with any literal code blocks blanked out and
5954  * appropriate chars escaped; then feed it into
5955  *
5956  *    eval "qr'modified_pattern'"
5957  *
5958  * For example,
5959  *
5960  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5961  *
5962  * becomes
5963  *
5964  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5965  *
5966  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5967  * and merge them with any code blocks of the original regexp.
5968  *
5969  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5970  * instead, just save the qr and return FALSE; this tells our caller that
5971  * the original pattern needs upgrading to utf8.
5972  */
5973
5974 static bool
5975 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5976     char *pat, STRLEN plen)
5977 {
5978     SV *qr;
5979
5980     GET_RE_DEBUG_FLAGS_DECL;
5981
5982     if (pRExC_state->runtime_code_qr) {
5983         /* this is the second time we've been called; this should
5984          * only happen if the main pattern got upgraded to utf8
5985          * during compilation; re-use the qr we compiled first time
5986          * round (which should be utf8 too)
5987          */
5988         qr = pRExC_state->runtime_code_qr;
5989         pRExC_state->runtime_code_qr = NULL;
5990         assert(RExC_utf8 && SvUTF8(qr));
5991     }
5992     else {
5993         int n = 0;
5994         STRLEN s;
5995         char *p, *newpat;
5996         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5997         SV *sv, *qr_ref;
5998         dSP;
5999
6000         /* determine how many extra chars we need for ' and \ escaping */
6001         for (s = 0; s < plen; s++) {
6002             if (pat[s] == '\'' || pat[s] == '\\')
6003                 newlen++;
6004         }
6005
6006         Newx(newpat, newlen, char);
6007         p = newpat;
6008         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6009
6010         for (s = 0; s < plen; s++) {
6011             if (n < pRExC_state->num_code_blocks
6012                 && s == pRExC_state->code_blocks[n].start)
6013             {
6014                 /* blank out literal code block */
6015                 assert(pat[s] == '(');
6016                 while (s <= pRExC_state->code_blocks[n].end) {
6017                     *p++ = '_';
6018                     s++;
6019                 }
6020                 s--;
6021                 n++;
6022                 continue;
6023             }
6024             if (pat[s] == '\'' || pat[s] == '\\')
6025                 *p++ = '\\';
6026             *p++ = pat[s];
6027         }
6028         *p++ = '\'';
6029         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6030             *p++ = 'x';
6031         *p++ = '\0';
6032         DEBUG_COMPILE_r({
6033             PerlIO_printf(Perl_debug_log,
6034                 "%sre-parsing pattern for runtime code:%s %s\n",
6035                 PL_colors[4],PL_colors[5],newpat);
6036         });
6037
6038         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6039         Safefree(newpat);
6040
6041         ENTER;
6042         SAVETMPS;
6043         save_re_context();
6044         PUSHSTACKi(PERLSI_REQUIRE);
6045         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6046          * parsing qr''; normally only q'' does this. It also alters
6047          * hints handling */
6048         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6049         SvREFCNT_dec_NN(sv);
6050         SPAGAIN;
6051         qr_ref = POPs;
6052         PUTBACK;
6053         {
6054             SV * const errsv = ERRSV;
6055             if (SvTRUE_NN(errsv))
6056             {
6057                 Safefree(pRExC_state->code_blocks);
6058                 /* use croak_sv ? */
6059                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6060             }
6061         }
6062         assert(SvROK(qr_ref));
6063         qr = SvRV(qr_ref);
6064         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6065         /* the leaving below frees the tmp qr_ref.
6066          * Give qr a life of its own */
6067         SvREFCNT_inc(qr);
6068         POPSTACK;
6069         FREETMPS;
6070         LEAVE;
6071
6072     }
6073
6074     if (!RExC_utf8 && SvUTF8(qr)) {
6075         /* first time through; the pattern got upgraded; save the
6076          * qr for the next time through */
6077         assert(!pRExC_state->runtime_code_qr);
6078         pRExC_state->runtime_code_qr = qr;
6079         return 0;
6080     }
6081
6082
6083     /* extract any code blocks within the returned qr//  */
6084
6085
6086     /* merge the main (r1) and run-time (r2) code blocks into one */
6087     {
6088         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6089         struct reg_code_block *new_block, *dst;
6090         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6091         int i1 = 0, i2 = 0;
6092
6093         if (!r2->num_code_blocks) /* we guessed wrong */
6094         {
6095             SvREFCNT_dec_NN(qr);
6096             return 1;
6097         }
6098
6099         Newx(new_block,
6100             r1->num_code_blocks + r2->num_code_blocks,
6101             struct reg_code_block);
6102         dst = new_block;
6103
6104         while (    i1 < r1->num_code_blocks
6105                 || i2 < r2->num_code_blocks)
6106         {
6107             struct reg_code_block *src;
6108             bool is_qr = 0;
6109
6110             if (i1 == r1->num_code_blocks) {
6111                 src = &r2->code_blocks[i2++];
6112                 is_qr = 1;
6113             }
6114             else if (i2 == r2->num_code_blocks)
6115                 src = &r1->code_blocks[i1++];
6116             else if (  r1->code_blocks[i1].start
6117                      < r2->code_blocks[i2].start)
6118             {
6119                 src = &r1->code_blocks[i1++];
6120                 assert(src->end < r2->code_blocks[i2].start);
6121             }
6122             else {
6123                 assert(  r1->code_blocks[i1].start
6124                        > r2->code_blocks[i2].start);
6125                 src = &r2->code_blocks[i2++];
6126                 is_qr = 1;
6127                 assert(src->end < r1->code_blocks[i1].start);
6128             }
6129
6130             assert(pat[src->start] == '(');
6131             assert(pat[src->end]   == ')');
6132             dst->start      = src->start;
6133             dst->end        = src->end;
6134             dst->block      = src->block;
6135             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6136                                     : src->src_regex;
6137             dst++;
6138         }
6139         r1->num_code_blocks += r2->num_code_blocks;
6140         Safefree(r1->code_blocks);
6141         r1->code_blocks = new_block;
6142     }
6143
6144     SvREFCNT_dec_NN(qr);
6145     return 1;
6146 }
6147
6148
6149 STATIC bool
6150 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6151                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6152                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6153                       STRLEN longest_length, bool eol, bool meol)
6154 {
6155     /* This is the common code for setting up the floating and fixed length
6156      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6157      * as to whether succeeded or not */
6158
6159     I32 t;
6160     SSize_t ml;
6161
6162     if (! (longest_length
6163            || (eol /* Can't have SEOL and MULTI */
6164                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6165           )
6166             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6167         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6168     {
6169         return FALSE;
6170     }
6171
6172     /* copy the information about the longest from the reg_scan_data
6173         over to the program. */
6174     if (SvUTF8(sv_longest)) {
6175         *rx_utf8 = sv_longest;
6176         *rx_substr = NULL;
6177     } else {
6178         *rx_substr = sv_longest;
6179         *rx_utf8 = NULL;
6180     }
6181     /* end_shift is how many chars that must be matched that
6182         follow this item. We calculate it ahead of time as once the
6183         lookbehind offset is added in we lose the ability to correctly
6184         calculate it.*/
6185     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6186     *rx_end_shift = ml - offset
6187         - longest_length + (SvTAIL(sv_longest) != 0)
6188         + lookbehind;
6189
6190     t = (eol/* Can't have SEOL and MULTI */
6191          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6192     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6193
6194     return TRUE;
6195 }
6196
6197 /*
6198  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6199  * regular expression into internal code.
6200  * The pattern may be passed either as:
6201  *    a list of SVs (patternp plus pat_count)
6202  *    a list of OPs (expr)
6203  * If both are passed, the SV list is used, but the OP list indicates
6204  * which SVs are actually pre-compiled code blocks
6205  *
6206  * The SVs in the list have magic and qr overloading applied to them (and
6207  * the list may be modified in-place with replacement SVs in the latter
6208  * case).
6209  *
6210  * If the pattern hasn't changed from old_re, then old_re will be
6211  * returned.
6212  *
6213  * eng is the current engine. If that engine has an op_comp method, then
6214  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6215  * do the initial concatenation of arguments and pass on to the external
6216  * engine.
6217  *
6218  * If is_bare_re is not null, set it to a boolean indicating whether the
6219  * arg list reduced (after overloading) to a single bare regex which has
6220  * been returned (i.e. /$qr/).
6221  *
6222  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6223  *
6224  * pm_flags contains the PMf_* flags, typically based on those from the
6225  * pm_flags field of the related PMOP. Currently we're only interested in
6226  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6227  *
6228  * We can't allocate space until we know how big the compiled form will be,
6229  * but we can't compile it (and thus know how big it is) until we've got a
6230  * place to put the code.  So we cheat:  we compile it twice, once with code
6231  * generation turned off and size counting turned on, and once "for real".
6232  * This also means that we don't allocate space until we are sure that the
6233  * thing really will compile successfully, and we never have to move the
6234  * code and thus invalidate pointers into it.  (Note that it has to be in
6235  * one piece because free() must be able to free it all.) [NB: not true in perl]
6236  *
6237  * Beware that the optimization-preparation code in here knows about some
6238  * of the structure of the compiled regexp.  [I'll say.]
6239  */
6240
6241 REGEXP *
6242 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6243                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6244                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6245 {
6246     dVAR;
6247     REGEXP *rx;
6248     struct regexp *r;
6249     regexp_internal *ri;
6250     STRLEN plen;
6251     char *exp;
6252     regnode *scan;
6253     I32 flags;
6254     SSize_t minlen = 0;
6255     U32 rx_flags;
6256     SV *pat;
6257     SV *code_blocksv = NULL;
6258     SV** new_patternp = patternp;
6259
6260     /* these are all flags - maybe they should be turned
6261      * into a single int with different bit masks */
6262     I32 sawlookahead = 0;
6263     I32 sawplus = 0;
6264     I32 sawopen = 0;
6265     I32 sawminmod = 0;
6266
6267     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6268     bool recompile = 0;
6269     bool runtime_code = 0;
6270     scan_data_t data;
6271     RExC_state_t RExC_state;
6272     RExC_state_t * const pRExC_state = &RExC_state;
6273 #ifdef TRIE_STUDY_OPT
6274     int restudied = 0;
6275     RExC_state_t copyRExC_state;
6276 #endif
6277     GET_RE_DEBUG_FLAGS_DECL;
6278
6279     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6280
6281     DEBUG_r(if (!PL_colorset) reginitcolors());
6282
6283 #ifndef PERL_IN_XSUB_RE
6284     /* Initialize these here instead of as-needed, as is quick and avoids
6285      * having to test them each time otherwise */
6286     if (! PL_AboveLatin1) {
6287         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6288         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6289         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6290         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6291         PL_HasMultiCharFold =
6292                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6293     }
6294 #endif
6295
6296     pRExC_state->code_blocks = NULL;
6297     pRExC_state->num_code_blocks = 0;
6298
6299     if (is_bare_re)
6300         *is_bare_re = FALSE;
6301
6302     if (expr && (expr->op_type == OP_LIST ||
6303                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6304         /* allocate code_blocks if needed */
6305         OP *o;
6306         int ncode = 0;
6307
6308         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6309             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6310                 ncode++; /* count of DO blocks */
6311         if (ncode) {
6312             pRExC_state->num_code_blocks = ncode;
6313             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6314         }
6315     }
6316
6317     if (!pat_count) {
6318         /* compile-time pattern with just OP_CONSTs and DO blocks */
6319
6320         int n;
6321         OP *o;
6322
6323         /* find how many CONSTs there are */
6324         assert(expr);
6325         n = 0;
6326         if (expr->op_type == OP_CONST)
6327             n = 1;
6328         else
6329             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6330                 if (o->op_type == OP_CONST)
6331                     n++;
6332             }
6333
6334         /* fake up an SV array */
6335
6336         assert(!new_patternp);
6337         Newx(new_patternp, n, SV*);
6338         SAVEFREEPV(new_patternp);
6339         pat_count = n;
6340
6341         n = 0;
6342         if (expr->op_type == OP_CONST)
6343             new_patternp[n] = cSVOPx_sv(expr);
6344         else
6345             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6346                 if (o->op_type == OP_CONST)
6347                     new_patternp[n++] = cSVOPo_sv;
6348             }
6349
6350     }
6351
6352     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6353         "Assembling pattern from %d elements%s\n", pat_count,
6354             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6355
6356     /* set expr to the first arg op */
6357
6358     if (pRExC_state->num_code_blocks
6359          && expr->op_type != OP_CONST)
6360     {
6361             expr = cLISTOPx(expr)->op_first;
6362             assert(   expr->op_type == OP_PUSHMARK
6363                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6364                    || expr->op_type == OP_PADRANGE);
6365             expr = expr->op_sibling;
6366     }
6367
6368     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6369                         expr, &recompile, NULL);
6370
6371     /* handle bare (possibly after overloading) regex: foo =~ $re */
6372     {
6373         SV *re = pat;
6374         if (SvROK(re))
6375             re = SvRV(re);
6376         if (SvTYPE(re) == SVt_REGEXP) {
6377             if (is_bare_re)
6378                 *is_bare_re = TRUE;
6379             SvREFCNT_inc(re);
6380             Safefree(pRExC_state->code_blocks);
6381             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6382                 "Precompiled pattern%s\n",
6383                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6384
6385             return (REGEXP*)re;
6386         }
6387     }
6388
6389     exp = SvPV_nomg(pat, plen);
6390
6391     if (!eng->op_comp) {
6392         if ((SvUTF8(pat) && IN_BYTES)
6393                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6394         {
6395             /* make a temporary copy; either to convert to bytes,
6396              * or to avoid repeating get-magic / overloaded stringify */
6397             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6398                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6399         }
6400         Safefree(pRExC_state->code_blocks);
6401         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6402     }
6403
6404     /* ignore the utf8ness if the pattern is 0 length */
6405     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6406     RExC_uni_semantics = 0;
6407     RExC_contains_locale = 0;
6408     RExC_contains_i = 0;
6409     pRExC_state->runtime_code_qr = NULL;
6410
6411     DEBUG_COMPILE_r({
6412             SV *dsv= sv_newmortal();
6413             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6414             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6415                           PL_colors[4],PL_colors[5],s);
6416         });
6417
6418   redo_first_pass:
6419     /* we jump here if we upgrade the pattern to utf8 and have to
6420      * recompile */
6421
6422     if ((pm_flags & PMf_USE_RE_EVAL)
6423                 /* this second condition covers the non-regex literal case,
6424                  * i.e.  $foo =~ '(?{})'. */
6425                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6426     )
6427         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6428
6429     /* return old regex if pattern hasn't changed */
6430     /* XXX: note in the below we have to check the flags as well as the
6431      * pattern.
6432      *
6433      * Things get a touch tricky as we have to compare the utf8 flag
6434      * independently from the compile flags.  */
6435
6436     if (   old_re
6437         && !recompile
6438         && !!RX_UTF8(old_re) == !!RExC_utf8
6439         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6440         && RX_PRECOMP(old_re)
6441         && RX_PRELEN(old_re) == plen
6442         && memEQ(RX_PRECOMP(old_re), exp, plen)
6443         && !runtime_code /* with runtime code, always recompile */ )
6444     {
6445         Safefree(pRExC_state->code_blocks);
6446         return old_re;
6447     }
6448
6449     rx_flags = orig_rx_flags;
6450
6451     if (rx_flags & PMf_FOLD) {
6452         RExC_contains_i = 1;
6453     }
6454     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6455
6456         /* Set to use unicode semantics if the pattern is in utf8 and has the
6457          * 'depends' charset specified, as it means unicode when utf8  */
6458         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6459     }
6460
6461     RExC_precomp = exp;
6462     RExC_flags = rx_flags;
6463     RExC_pm_flags = pm_flags;
6464
6465     if (runtime_code) {
6466         if (TAINTING_get && TAINT_get)
6467             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6468
6469         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6470             /* whoops, we have a non-utf8 pattern, whilst run-time code
6471              * got compiled as utf8. Try again with a utf8 pattern */
6472             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6473                                     pRExC_state->num_code_blocks);
6474             goto redo_first_pass;
6475         }
6476     }
6477     assert(!pRExC_state->runtime_code_qr);
6478
6479     RExC_sawback = 0;
6480
6481     RExC_seen = 0;
6482     RExC_maxlen = 0;
6483     RExC_in_lookbehind = 0;
6484     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6485     RExC_extralen = 0;
6486     RExC_override_recoding = 0;
6487     RExC_in_multi_char_class = 0;
6488
6489     /* First pass: determine size, legality. */
6490     RExC_parse = exp;
6491     RExC_start = exp;
6492     RExC_end = exp + plen;
6493     RExC_naughty = 0;
6494     RExC_npar = 1;
6495     RExC_nestroot = 0;
6496     RExC_size = 0L;
6497     RExC_emit = (regnode *) &RExC_emit_dummy;
6498     RExC_whilem_seen = 0;
6499     RExC_open_parens = NULL;
6500     RExC_close_parens = NULL;
6501     RExC_opend = NULL;
6502     RExC_paren_names = NULL;
6503 #ifdef DEBUGGING
6504     RExC_paren_name_list = NULL;
6505 #endif
6506     RExC_recurse = NULL;
6507     RExC_study_chunk_recursed = NULL;
6508     RExC_study_chunk_recursed_bytes= 0;
6509     RExC_recurse_count = 0;
6510     pRExC_state->code_index = 0;
6511
6512 #if 0 /* REGC() is (currently) a NOP at the first pass.
6513        * Clever compilers notice this and complain. --jhi */
6514     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6515 #endif
6516     DEBUG_PARSE_r(
6517         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6518         RExC_lastnum=0;
6519         RExC_lastparse=NULL;
6520     );
6521     /* reg may croak on us, not giving us a chance to free
6522        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6523        need it to survive as long as the regexp (qr/(?{})/).
6524        We must check that code_blocksv is not already set, because we may
6525        have jumped back to restart the sizing pass. */
6526     if (pRExC_state->code_blocks && !code_blocksv) {
6527         code_blocksv = newSV_type(SVt_PV);
6528         SAVEFREESV(code_blocksv);
6529         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6530         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6531     }
6532     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6533         /* It's possible to write a regexp in ascii that represents Unicode
6534         codepoints outside of the byte range, such as via \x{100}. If we
6535         detect such a sequence we have to convert the entire pattern to utf8
6536         and then recompile, as our sizing calculation will have been based
6537         on 1 byte == 1 character, but we will need to use utf8 to encode
6538         at least some part of the pattern, and therefore must convert the whole
6539         thing.
6540         -- dmq */
6541         if (flags & RESTART_UTF8) {
6542             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6543                                     pRExC_state->num_code_blocks);
6544             goto redo_first_pass;
6545         }
6546         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6547     }
6548     if (code_blocksv)
6549         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6550
6551     DEBUG_PARSE_r({
6552         PerlIO_printf(Perl_debug_log,
6553             "Required size %"IVdf" nodes\n"
6554             "Starting second pass (creation)\n",
6555             (IV)RExC_size);
6556         RExC_lastnum=0;
6557         RExC_lastparse=NULL;
6558     });
6559
6560     /* The first pass could have found things that force Unicode semantics */
6561     if ((RExC_utf8 || RExC_uni_semantics)
6562          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6563     {
6564         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6565     }
6566
6567     /* Small enough for pointer-storage convention?
6568        If extralen==0, this means that we will not need long jumps. */
6569     if (RExC_size >= 0x10000L && RExC_extralen)
6570         RExC_size += RExC_extralen;
6571     else
6572         RExC_extralen = 0;
6573     if (RExC_whilem_seen > 15)
6574         RExC_whilem_seen = 15;
6575
6576     /* Allocate space and zero-initialize. Note, the two step process
6577        of zeroing when in debug mode, thus anything assigned has to
6578        happen after that */
6579     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6580     r = ReANY(rx);
6581     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6582          char, regexp_internal);
6583     if ( r == NULL || ri == NULL )
6584         FAIL("Regexp out of space");
6585 #ifdef DEBUGGING
6586     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6587     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6588          char);
6589 #else
6590     /* bulk initialize base fields with 0. */
6591     Zero(ri, sizeof(regexp_internal), char);
6592 #endif
6593
6594     /* non-zero initialization begins here */
6595     RXi_SET( r, ri );
6596     r->engine= eng;
6597     r->extflags = rx_flags;
6598     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6599
6600     if (pm_flags & PMf_IS_QR) {
6601         ri->code_blocks = pRExC_state->code_blocks;
6602         ri->num_code_blocks = pRExC_state->num_code_blocks;
6603     }
6604     else
6605     {
6606         int n;
6607         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6608             if (pRExC_state->code_blocks[n].src_regex)
6609                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6610         SAVEFREEPV(pRExC_state->code_blocks);
6611     }
6612
6613     {
6614         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6615         bool has_charset = (get_regex_charset(r->extflags)
6616                                                     != REGEX_DEPENDS_CHARSET);
6617
6618         /* The caret is output if there are any defaults: if not all the STD
6619          * flags are set, or if no character set specifier is needed */
6620         bool has_default =
6621                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6622                     || ! has_charset);
6623         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6624                                                    == REG_RUN_ON_COMMENT_SEEN);
6625         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6626                             >> RXf_PMf_STD_PMMOD_SHIFT);
6627         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6628         char *p;
6629         /* Allocate for the worst case, which is all the std flags are turned
6630          * on.  If more precision is desired, we could do a population count of
6631          * the flags set.  This could be done with a small lookup table, or by
6632          * shifting, masking and adding, or even, when available, assembly
6633          * language for a machine-language population count.
6634          * We never output a minus, as all those are defaults, so are
6635          * covered by the caret */
6636         const STRLEN wraplen = plen + has_p + has_runon
6637             + has_default       /* If needs a caret */
6638
6639                 /* If needs a character set specifier */
6640             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6641             + (sizeof(STD_PAT_MODS) - 1)
6642             + (sizeof("(?:)") - 1);
6643
6644         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6645         r->xpv_len_u.xpvlenu_pv = p;
6646         if (RExC_utf8)
6647             SvFLAGS(rx) |= SVf_UTF8;
6648         *p++='('; *p++='?';
6649
6650         /* If a default, cover it using the caret */
6651         if (has_default) {
6652             *p++= DEFAULT_PAT_MOD;
6653         }
6654         if (has_charset) {
6655             STRLEN len;
6656             const char* const name = get_regex_charset_name(r->extflags, &len);
6657             Copy(name, p, len, char);
6658             p += len;
6659         }
6660         if (has_p)
6661             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6662         {
6663             char ch;
6664             while((ch = *fptr++)) {
6665                 if(reganch & 1)
6666                     *p++ = ch;
6667                 reganch >>= 1;
6668             }
6669         }
6670
6671         *p++ = ':';
6672         Copy(RExC_precomp, p, plen, char);
6673         assert ((RX_WRAPPED(rx) - p) < 16);
6674         r->pre_prefix = p - RX_WRAPPED(rx);
6675         p += plen;
6676         if (has_runon)
6677             *p++ = '\n';
6678         *p++ = ')';
6679         *p = 0;
6680         SvCUR_set(rx, p - RX_WRAPPED(rx));
6681     }
6682
6683     r->intflags = 0;
6684     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6685
6686     /* setup various meta data about recursion, this all requires
6687      * RExC_npar to be correctly set, and a bit later on we clear it */
6688     if (RExC_seen & REG_RECURSE_SEEN) {
6689         Newxz(RExC_open_parens, RExC_npar,regnode *);
6690         SAVEFREEPV(RExC_open_parens);
6691         Newxz(RExC_close_parens,RExC_npar,regnode *);
6692         SAVEFREEPV(RExC_close_parens);
6693     }
6694     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6695         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6696          * So its 1 if there are no parens. */
6697         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6698                                          ((RExC_npar & 0x07) != 0);
6699         Newx(RExC_study_chunk_recursed,
6700              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6701         SAVEFREEPV(RExC_study_chunk_recursed);
6702     }
6703
6704     /* Useful during FAIL. */
6705 #ifdef RE_TRACK_PATTERN_OFFSETS
6706     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6707     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6708                           "%s %"UVuf" bytes for offset annotations.\n",
6709                           ri->u.offsets ? "Got" : "Couldn't get",
6710                           (UV)((2*RExC_size+1) * sizeof(U32))));
6711 #endif
6712     SetProgLen(ri,RExC_size);
6713     RExC_rx_sv = rx;
6714     RExC_rx = r;
6715     RExC_rxi = ri;
6716
6717     /* Second pass: emit code. */
6718     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6719     RExC_pm_flags = pm_flags;
6720     RExC_parse = exp;
6721     RExC_end = exp + plen;
6722     RExC_naughty = 0;
6723     RExC_npar = 1;
6724     RExC_emit_start = ri->program;
6725     RExC_emit = ri->program;
6726     RExC_emit_bound = ri->program + RExC_size + 1;
6727     pRExC_state->code_index = 0;
6728
6729     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6730     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6731         ReREFCNT_dec(rx);
6732         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6733     }
6734     /* XXXX To minimize changes to RE engine we always allocate
6735        3-units-long substrs field. */
6736     Newx(r->substrs, 1, struct reg_substr_data);
6737     if (RExC_recurse_count) {
6738         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6739         SAVEFREEPV(RExC_recurse);
6740     }
6741
6742 reStudy:
6743     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6744     Zero(r->substrs, 1, struct reg_substr_data);
6745     if (RExC_study_chunk_recursed)
6746         Zero(RExC_study_chunk_recursed,
6747              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6748
6749 #ifdef TRIE_STUDY_OPT
6750     if (!restudied) {
6751         StructCopy(&zero_scan_data, &data, scan_data_t);
6752         copyRExC_state = RExC_state;
6753     } else {
6754         U32 seen=RExC_seen;
6755         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6756
6757         RExC_state = copyRExC_state;
6758         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6759             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6760         else
6761             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6762         StructCopy(&zero_scan_data, &data, scan_data_t);
6763     }
6764 #else
6765     StructCopy(&zero_scan_data, &data, scan_data_t);
6766 #endif
6767
6768     /* Dig out information for optimizations. */
6769     r->extflags = RExC_flags; /* was pm_op */
6770     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6771
6772     if (UTF)
6773         SvUTF8_on(rx);  /* Unicode in it? */
6774     ri->regstclass = NULL;
6775     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6776         r->intflags |= PREGf_NAUGHTY;
6777     scan = ri->program + 1;             /* First BRANCH. */
6778
6779     /* testing for BRANCH here tells us whether there is "must appear"
6780        data in the pattern. If there is then we can use it for optimisations */
6781     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6782                                                   */
6783         SSize_t fake;
6784         STRLEN longest_float_length, longest_fixed_length;
6785         regnode_ssc ch_class; /* pointed to by data */
6786         int stclass_flag;
6787         SSize_t last_close = 0; /* pointed to by data */
6788         regnode *first= scan;
6789         regnode *first_next= regnext(first);
6790         /*
6791          * Skip introductions and multiplicators >= 1
6792          * so that we can extract the 'meat' of the pattern that must
6793          * match in the large if() sequence following.
6794          * NOTE that EXACT is NOT covered here, as it is normally
6795          * picked up by the optimiser separately.
6796          *
6797          * This is unfortunate as the optimiser isnt handling lookahead
6798          * properly currently.
6799          *
6800          */
6801         while ((OP(first) == OPEN && (sawopen = 1)) ||
6802                /* An OR of *one* alternative - should not happen now. */
6803             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6804             /* for now we can't handle lookbehind IFMATCH*/
6805             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6806             (OP(first) == PLUS) ||
6807             (OP(first) == MINMOD) ||
6808                /* An {n,m} with n>0 */
6809             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6810             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6811         {
6812                 /*
6813                  * the only op that could be a regnode is PLUS, all the rest
6814                  * will be regnode_1 or regnode_2.
6815                  *
6816                  * (yves doesn't think this is true)
6817                  */
6818                 if (OP(first) == PLUS)
6819                     sawplus = 1;
6820                 else {
6821                     if (OP(first) == MINMOD)
6822                         sawminmod = 1;
6823                     first += regarglen[OP(first)];
6824                 }
6825                 first = NEXTOPER(first);
6826                 first_next= regnext(first);
6827         }
6828
6829         /* Starting-point info. */
6830       again:
6831         DEBUG_PEEP("first:",first,0);
6832         /* Ignore EXACT as we deal with it later. */
6833         if (PL_regkind[OP(first)] == EXACT) {
6834             if (OP(first) == EXACT)
6835                 NOOP;   /* Empty, get anchored substr later. */
6836             else
6837                 ri->regstclass = first;
6838         }
6839 #ifdef TRIE_STCLASS
6840         else if (PL_regkind[OP(first)] == TRIE &&
6841                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6842         {
6843             /* this can happen only on restudy */
6844             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6845         }
6846 #endif
6847         else if (REGNODE_SIMPLE(OP(first)))
6848             ri->regstclass = first;
6849         else if (PL_regkind[OP(first)] == BOUND ||
6850                  PL_regkind[OP(first)] == NBOUND)
6851             ri->regstclass = first;
6852         else if (PL_regkind[OP(first)] == BOL) {
6853             r->intflags |= (OP(first) == MBOL
6854                            ? PREGf_ANCH_MBOL
6855                            : (OP(first) == SBOL
6856                               ? PREGf_ANCH_SBOL
6857                               : PREGf_ANCH_BOL));
6858             first = NEXTOPER(first);
6859             goto again;
6860         }
6861         else if (OP(first) == GPOS) {
6862             r->intflags |= PREGf_ANCH_GPOS;
6863             first = NEXTOPER(first);
6864             goto again;
6865         }
6866         else if ((!sawopen || !RExC_sawback) &&
6867             (OP(first) == STAR &&
6868             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6869             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6870         {
6871             /* turn .* into ^.* with an implied $*=1 */
6872             const int type =
6873                 (OP(NEXTOPER(first)) == REG_ANY)
6874                     ? PREGf_ANCH_MBOL
6875                     : PREGf_ANCH_SBOL;
6876             r->intflags |= (type | PREGf_IMPLICIT);
6877             first = NEXTOPER(first);
6878             goto again;
6879         }
6880         if (sawplus && !sawminmod && !sawlookahead
6881             && (!sawopen || !RExC_sawback)
6882             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6883             /* x+ must match at the 1st pos of run of x's */
6884             r->intflags |= PREGf_SKIP;
6885
6886         /* Scan is after the zeroth branch, first is atomic matcher. */
6887 #ifdef TRIE_STUDY_OPT
6888         DEBUG_PARSE_r(
6889             if (!restudied)
6890                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6891                               (IV)(first - scan + 1))
6892         );
6893 #else
6894         DEBUG_PARSE_r(
6895             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6896                 (IV)(first - scan + 1))
6897         );
6898 #endif
6899
6900
6901         /*
6902         * If there's something expensive in the r.e., find the
6903         * longest literal string that must appear and make it the
6904         * regmust.  Resolve ties in favor of later strings, since
6905         * the regstart check works with the beginning of the r.e.
6906         * and avoiding duplication strengthens checking.  Not a
6907         * strong reason, but sufficient in the absence of others.
6908         * [Now we resolve ties in favor of the earlier string if
6909         * it happens that c_offset_min has been invalidated, since the
6910         * earlier string may buy us something the later one won't.]
6911         */
6912
6913         data.longest_fixed = newSVpvs("");
6914         data.longest_float = newSVpvs("");
6915         data.last_found = newSVpvs("");
6916         data.longest = &(data.longest_fixed);
6917         ENTER_with_name("study_chunk");
6918         SAVEFREESV(data.longest_fixed);
6919         SAVEFREESV(data.longest_float);
6920         SAVEFREESV(data.last_found);
6921         first = scan;
6922         if (!ri->regstclass) {
6923             ssc_init(pRExC_state, &ch_class);
6924             data.start_class = &ch_class;
6925             stclass_flag = SCF_DO_STCLASS_AND;
6926         } else                          /* XXXX Check for BOUND? */
6927             stclass_flag = 0;
6928         data.last_closep = &last_close;
6929
6930         DEBUG_RExC_seen();
6931         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6932                              scan + RExC_size, /* Up to end */
6933             &data, -1, 0, NULL,
6934             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6935                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6936             0);
6937
6938
6939         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6940
6941
6942         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6943              && data.last_start_min == 0 && data.last_end > 0
6944              && !RExC_seen_zerolen
6945              && !(RExC_seen & REG_VERBARG_SEEN)
6946              && !(RExC_seen & REG_GPOS_SEEN)
6947         ){
6948             r->extflags |= RXf_CHECK_ALL;
6949         }
6950         scan_commit(pRExC_state, &data,&minlen,0);
6951
6952         longest_float_length = CHR_SVLEN(data.longest_float);
6953
6954         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6955                    && data.offset_fixed == data.offset_float_min
6956                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6957             && S_setup_longest (aTHX_ pRExC_state,
6958                                     data.longest_float,
6959                                     &(r->float_utf8),
6960                                     &(r->float_substr),
6961                                     &(r->float_end_shift),
6962                                     data.lookbehind_float,
6963                                     data.offset_float_min,
6964                                     data.minlen_float,
6965                                     longest_float_length,
6966                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6967                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6968         {
6969             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6970             r->float_max_offset = data.offset_float_max;
6971             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6972                 r->float_max_offset -= data.lookbehind_float;
6973             SvREFCNT_inc_simple_void_NN(data.longest_float);
6974         }
6975         else {
6976             r->float_substr = r->float_utf8 = NULL;
6977             longest_float_length = 0;
6978         }
6979
6980         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6981
6982         if (S_setup_longest (aTHX_ pRExC_state,
6983                                 data.longest_fixed,
6984                                 &(r->anchored_utf8),
6985                                 &(r->anchored_substr),
6986                                 &(r->anchored_end_shift),
6987                                 data.lookbehind_fixed,
6988                                 data.offset_fixed,
6989                                 data.minlen_fixed,
6990                                 longest_fixed_length,
6991                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6992                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6993         {
6994             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6995             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6996         }
6997         else {
6998             r->anchored_substr = r->anchored_utf8 = NULL;
6999             longest_fixed_length = 0;
7000         }
7001         LEAVE_with_name("study_chunk");
7002
7003         if (ri->regstclass
7004             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7005             ri->regstclass = NULL;
7006
7007         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7008             && stclass_flag
7009             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7010             && !ssc_is_anything(data.start_class))
7011         {
7012             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7013
7014             ssc_finalize(pRExC_state, data.start_class);
7015
7016             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7017             StructCopy(data.start_class,
7018                        (regnode_ssc*)RExC_rxi->data->data[n],
7019                        regnode_ssc);
7020             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7021             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7022             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7023                       regprop(r, sv, (regnode*)data.start_class, NULL);
7024                       PerlIO_printf(Perl_debug_log,
7025                                     "synthetic stclass \"%s\".\n",
7026                                     SvPVX_const(sv));});
7027             data.start_class = NULL;
7028         }
7029
7030         /* A temporary algorithm prefers floated substr to fixed one to dig
7031          * more info. */
7032         if (longest_fixed_length > longest_float_length) {
7033             r->substrs->check_ix = 0;
7034             r->check_end_shift = r->anchored_end_shift;
7035             r->check_substr = r->anchored_substr;
7036             r->check_utf8 = r->anchored_utf8;
7037             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7038             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7039                 r->intflags |= PREGf_NOSCAN;
7040         }
7041         else {
7042             r->substrs->check_ix = 1;
7043             r->check_end_shift = r->float_end_shift;
7044             r->check_substr = r->float_substr;
7045             r->check_utf8 = r->float_utf8;
7046             r->check_offset_min = r->float_min_offset;
7047             r->check_offset_max = r->float_max_offset;
7048         }
7049         if ((r->check_substr || r->check_utf8) ) {
7050             r->extflags |= RXf_USE_INTUIT;
7051             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7052                 r->extflags |= RXf_INTUIT_TAIL;
7053         }
7054         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7055
7056         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7057         if ( (STRLEN)minlen < longest_float_length )
7058             minlen= longest_float_length;
7059         if ( (STRLEN)minlen < longest_fixed_length )
7060             minlen= longest_fixed_length;
7061         */
7062     }
7063     else {
7064         /* Several toplevels. Best we can is to set minlen. */
7065         SSize_t fake;
7066         regnode_ssc ch_class;
7067         SSize_t last_close = 0;
7068
7069         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7070
7071         scan = ri->program + 1;
7072         ssc_init(pRExC_state, &ch_class);
7073         data.start_class = &ch_class;
7074         data.last_closep = &last_close;
7075
7076         DEBUG_RExC_seen();
7077         minlen = study_chunk(pRExC_state,
7078             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7079             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7080                                                       ? SCF_TRIE_DOING_RESTUDY
7081                                                       : 0),
7082             0);
7083
7084         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7085
7086         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7087                 = r->float_substr = r->float_utf8 = NULL;
7088
7089         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7090             && ! ssc_is_anything(data.start_class))
7091         {
7092             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7093
7094             ssc_finalize(pRExC_state, data.start_class);
7095
7096             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7097             StructCopy(data.start_class,
7098                        (regnode_ssc*)RExC_rxi->data->data[n],
7099                        regnode_ssc);
7100             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7101             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7102             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7103                       regprop(r, sv, (regnode*)data.start_class, NULL);
7104                       PerlIO_printf(Perl_debug_log,
7105                                     "synthetic stclass \"%s\".\n",
7106                                     SvPVX_const(sv));});
7107             data.start_class = NULL;
7108         }
7109     }
7110
7111     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7112         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7113         r->maxlen = REG_INFTY;
7114     }
7115     else {
7116         r->maxlen = RExC_maxlen;
7117     }
7118
7119     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7120        the "real" pattern. */
7121     DEBUG_OPTIMISE_r({
7122         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7123                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7124     });
7125     r->minlenret = minlen;
7126     if (r->minlen < minlen)
7127         r->minlen = minlen;
7128
7129     if (RExC_seen & REG_GPOS_SEEN)
7130         r->intflags |= PREGf_GPOS_SEEN;
7131     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7132         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7133                                                 lookbehind */
7134     if (pRExC_state->num_code_blocks)
7135         r->extflags |= RXf_EVAL_SEEN;
7136     if (RExC_seen & REG_CANY_SEEN)
7137         r->intflags |= PREGf_CANY_SEEN;
7138     if (RExC_seen & REG_VERBARG_SEEN)
7139     {
7140         r->intflags |= PREGf_VERBARG_SEEN;
7141         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7142     }
7143     if (RExC_seen & REG_CUTGROUP_SEEN)
7144         r->intflags |= PREGf_CUTGROUP_SEEN;
7145     if (pm_flags & PMf_USE_RE_EVAL)
7146         r->intflags |= PREGf_USE_RE_EVAL;
7147     if (RExC_paren_names)
7148         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7149     else
7150         RXp_PAREN_NAMES(r) = NULL;
7151
7152     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7153      * so it can be used in pp.c */
7154     if (r->intflags & PREGf_ANCH)
7155         r->extflags |= RXf_IS_ANCHORED;
7156
7157
7158     {
7159         /* this is used to identify "special" patterns that might result
7160          * in Perl NOT calling the regex engine and instead doing the match "itself",
7161          * particularly special cases in split//. By having the regex compiler
7162          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7163          * we avoid weird issues with equivalent patterns resulting in different behavior,
7164          * AND we allow non Perl engines to get the same optimizations by the setting the
7165          * flags appropriately - Yves */
7166         regnode *first = ri->program + 1;
7167         U8 fop = OP(first);
7168         regnode *next = NEXTOPER(first);
7169         U8 nop = OP(next);
7170
7171         if (PL_regkind[fop] == NOTHING && nop == END)
7172             r->extflags |= RXf_NULL;
7173         else if (PL_regkind[fop] == BOL && nop == END)
7174             r->extflags |= RXf_START_ONLY;
7175         else if (fop == PLUS
7176                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7177                  && OP(regnext(first)) == END)
7178             r->extflags |= RXf_WHITE;
7179         else if ( r->extflags & RXf_SPLIT
7180                   && fop == EXACT
7181                   && STR_LEN(first) == 1
7182                   && *(STRING(first)) == ' '
7183                   && OP(regnext(first)) == END )
7184             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7185
7186     }
7187
7188     if (RExC_contains_locale) {
7189         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7190     }
7191
7192 #ifdef DEBUGGING
7193     if (RExC_paren_names) {
7194         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7195         ri->data->data[ri->name_list_idx]
7196                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7197     } else
7198 #endif
7199         ri->name_list_idx = 0;
7200
7201     if (RExC_recurse_count) {
7202         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7203             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7204             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7205         }
7206     }
7207     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7208     /* assume we don't need to swap parens around before we match */
7209
7210     DEBUG_DUMP_r({
7211         DEBUG_RExC_seen();
7212         PerlIO_printf(Perl_debug_log,"Final program:\n");
7213         regdump(r);
7214     });
7215 #ifdef RE_TRACK_PATTERN_OFFSETS
7216     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7217         const STRLEN len = ri->u.offsets[0];
7218         STRLEN i;
7219         GET_RE_DEBUG_FLAGS_DECL;
7220         PerlIO_printf(Perl_debug_log,
7221                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7222         for (i = 1; i <= len; i++) {
7223             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7224                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7225                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7226             }
7227         PerlIO_printf(Perl_debug_log, "\n");
7228     });
7229 #endif
7230
7231 #ifdef USE_ITHREADS
7232     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7233      * by setting the regexp SV to readonly-only instead. If the
7234      * pattern's been recompiled, the USEDness should remain. */
7235     if (old_re && SvREADONLY(old_re))
7236         SvREADONLY_on(rx);
7237 #endif
7238     return rx;
7239 }
7240
7241
7242 SV*
7243 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7244                     const U32 flags)
7245 {
7246     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7247
7248     PERL_UNUSED_ARG(value);
7249
7250     if (flags & RXapif_FETCH) {
7251         return reg_named_buff_fetch(rx, key, flags);
7252     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7253         Perl_croak_no_modify();
7254         return NULL;
7255     } else if (flags & RXapif_EXISTS) {
7256         return reg_named_buff_exists(rx, key, flags)
7257             ? &PL_sv_yes
7258             : &PL_sv_no;
7259     } else if (flags & RXapif_REGNAMES) {
7260         return reg_named_buff_all(rx, flags);
7261     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7262         return reg_named_buff_scalar(rx, flags);
7263     } else {
7264         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7265         return NULL;
7266     }
7267 }
7268
7269 SV*
7270 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7271                          const U32 flags)
7272 {
7273     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7274     PERL_UNUSED_ARG(lastkey);
7275
7276     if (flags & RXapif_FIRSTKEY)
7277         return reg_named_buff_firstkey(rx, flags);
7278     else if (flags & RXapif_NEXTKEY)
7279         return reg_named_buff_nextkey(rx, flags);
7280     else {
7281         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7282                                             (int)flags);
7283         return NULL;
7284     }
7285 }
7286
7287 SV*
7288 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7289                           const U32 flags)
7290 {
7291     AV *retarray = NULL;
7292     SV *ret;
7293     struct regexp *const rx = ReANY(r);
7294
7295     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7296
7297     if (flags & RXapif_ALL)
7298         retarray=newAV();
7299
7300     if (rx && RXp_PAREN_NAMES(rx)) {
7301         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7302         if (he_str) {
7303             IV i;
7304             SV* sv_dat=HeVAL(he_str);
7305             I32 *nums=(I32*)SvPVX(sv_dat);
7306             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7307                 if ((I32)(rx->nparens) >= nums[i]
7308                     && rx->offs[nums[i]].start != -1
7309                     && rx->offs[nums[i]].end != -1)
7310                 {
7311                     ret = newSVpvs("");
7312                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7313                     if (!retarray)
7314                         return ret;
7315                 } else {
7316                     if (retarray)
7317                         ret = newSVsv(&PL_sv_undef);
7318                 }
7319                 if (retarray)
7320                     av_push(retarray, ret);
7321             }
7322             if (retarray)
7323                 return newRV_noinc(MUTABLE_SV(retarray));
7324         }
7325     }
7326     return NULL;
7327 }
7328
7329 bool
7330 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7331                            const U32 flags)
7332 {
7333     struct regexp *const rx = ReANY(r);
7334
7335     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7336
7337     if (rx && RXp_PAREN_NAMES(rx)) {
7338         if (flags & RXapif_ALL) {
7339             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7340         } else {
7341             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7342             if (sv) {
7343                 SvREFCNT_dec_NN(sv);
7344                 return TRUE;
7345             } else {
7346                 return FALSE;
7347             }
7348         }
7349     } else {
7350         return FALSE;
7351     }
7352 }
7353
7354 SV*
7355 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7356 {
7357     struct regexp *const rx = ReANY(r);
7358
7359     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7360
7361     if ( rx && RXp_PAREN_NAMES(rx) ) {
7362         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7363
7364         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7365     } else {
7366         return FALSE;
7367     }
7368 }
7369
7370 SV*
7371 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7372 {
7373     struct regexp *const rx = ReANY(r);
7374     GET_RE_DEBUG_FLAGS_DECL;
7375
7376     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7377
7378     if (rx && RXp_PAREN_NAMES(rx)) {
7379         HV *hv = RXp_PAREN_NAMES(rx);
7380         HE *temphe;
7381         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7382             IV i;
7383             IV parno = 0;
7384             SV* sv_dat = HeVAL(temphe);
7385             I32 *nums = (I32*)SvPVX(sv_dat);
7386             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7387                 if ((I32)(rx->lastparen) >= nums[i] &&
7388                     rx->offs[nums[i]].start != -1 &&
7389                     rx->offs[nums[i]].end != -1)
7390                 {
7391                     parno = nums[i];
7392                     break;
7393                 }
7394             }
7395             if (parno || flags & RXapif_ALL) {
7396                 return newSVhek(HeKEY_hek(temphe));
7397             }
7398         }
7399     }
7400     return NULL;
7401 }
7402
7403 SV*
7404 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7405 {
7406     SV *ret;
7407     AV *av;
7408     SSize_t length;
7409     struct regexp *const rx = ReANY(r);
7410
7411     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7412
7413     if (rx && RXp_PAREN_NAMES(rx)) {
7414         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7415             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7416         } else if (flags & RXapif_ONE) {
7417             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7418             av = MUTABLE_AV(SvRV(ret));
7419             length = av_tindex(av);
7420             SvREFCNT_dec_NN(ret);
7421             return newSViv(length + 1);
7422         } else {
7423             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7424                                                 (int)flags);
7425             return NULL;
7426         }
7427     }
7428     return &PL_sv_undef;
7429 }
7430
7431 SV*
7432 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7433 {
7434     struct regexp *const rx = ReANY(r);
7435     AV *av = newAV();
7436
7437     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7438
7439     if (rx && RXp_PAREN_NAMES(rx)) {
7440         HV *hv= RXp_PAREN_NAMES(rx);
7441         HE *temphe;
7442         (void)hv_iterinit(hv);
7443         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7444             IV i;
7445             IV parno = 0;
7446             SV* sv_dat = HeVAL(temphe);
7447             I32 *nums = (I32*)SvPVX(sv_dat);
7448             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7449                 if ((I32)(rx->lastparen) >= nums[i] &&
7450                     rx->offs[nums[i]].start != -1 &&
7451                     rx->offs[nums[i]].end != -1)
7452                 {
7453                     parno = nums[i];
7454                     break;
7455                 }
7456             }
7457             if (parno || flags & RXapif_ALL) {
7458                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7459             }
7460         }
7461     }
7462
7463     return newRV_noinc(MUTABLE_SV(av));
7464 }
7465
7466 void
7467 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7468                              SV * const sv)
7469 {
7470     struct regexp *const rx = ReANY(r);
7471     char *s = NULL;
7472     SSize_t i = 0;
7473     SSize_t s1, t1;
7474     I32 n = paren;
7475
7476     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7477
7478     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7479            || n == RX_BUFF_IDX_CARET_FULLMATCH
7480            || n == RX_BUFF_IDX_CARET_POSTMATCH
7481        )
7482     {
7483         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7484         if (!keepcopy) {
7485             /* on something like
7486              *    $r = qr/.../;
7487              *    /$qr/p;
7488              * the KEEPCOPY is set on the PMOP rather than the regex */
7489             if (PL_curpm && r == PM_GETRE(PL_curpm))
7490                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7491         }
7492         if (!keepcopy)
7493             goto ret_undef;
7494     }
7495
7496     if (!rx->subbeg)
7497         goto ret_undef;
7498
7499     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7500         /* no need to distinguish between them any more */
7501         n = RX_BUFF_IDX_FULLMATCH;
7502
7503     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7504         && rx->offs[0].start != -1)
7505     {
7506         /* $`, ${^PREMATCH} */
7507         i = rx->offs[0].start;
7508         s = rx->subbeg;
7509     }
7510     else
7511     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7512         && rx->offs[0].end != -1)
7513     {
7514         /* $', ${^POSTMATCH} */
7515         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7516         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7517     }
7518     else
7519     if ( 0 <= n && n <= (I32)rx->nparens &&
7520         (s1 = rx->offs[n].start) != -1 &&
7521         (t1 = rx->offs[n].end) != -1)
7522     {
7523         /* $&, ${^MATCH},  $1 ... */
7524         i = t1 - s1;
7525         s = rx->subbeg + s1 - rx->suboffset;
7526     } else {
7527         goto ret_undef;
7528     }
7529
7530     assert(s >= rx->subbeg);
7531     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7532     if (i >= 0) {
7533 #ifdef NO_TAINT_SUPPORT
7534         sv_setpvn(sv, s, i);
7535 #else
7536         const int oldtainted = TAINT_get;
7537         TAINT_NOT;
7538         sv_setpvn(sv, s, i);
7539         TAINT_set(oldtainted);
7540 #endif
7541         if ( (rx->intflags & PREGf_CANY_SEEN)
7542             ? (RXp_MATCH_UTF8(rx)
7543                         && (!i || is_utf8_string((U8*)s, i)))
7544             : (RXp_MATCH_UTF8(rx)) )
7545         {
7546             SvUTF8_on(sv);
7547         }
7548         else
7549             SvUTF8_off(sv);
7550         if (TAINTING_get) {
7551             if (RXp_MATCH_TAINTED(rx)) {
7552                 if (SvTYPE(sv) >= SVt_PVMG) {
7553                     MAGIC* const mg = SvMAGIC(sv);
7554                     MAGIC* mgt;
7555                     TAINT;
7556                     SvMAGIC_set(sv, mg->mg_moremagic);
7557                     SvTAINT(sv);
7558                     if ((mgt = SvMAGIC(sv))) {
7559                         mg->mg_moremagic = mgt;
7560                         SvMAGIC_set(sv, mg);
7561                     }
7562                 } else {
7563                     TAINT;
7564                     SvTAINT(sv);
7565                 }
7566             } else
7567                 SvTAINTED_off(sv);
7568         }
7569     } else {
7570       ret_undef:
7571         sv_setsv(sv,&PL_sv_undef);
7572         return;
7573     }
7574 }
7575
7576 void
7577 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7578                                                          SV const * const value)
7579 {
7580     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7581
7582     PERL_UNUSED_ARG(rx);
7583     PERL_UNUSED_ARG(paren);
7584     PERL_UNUSED_ARG(value);
7585
7586     if (!PL_localizing)
7587         Perl_croak_no_modify();
7588 }
7589
7590 I32
7591 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7592                               const I32 paren)
7593 {
7594     struct regexp *const rx = ReANY(r);
7595     I32 i;
7596     I32 s1, t1;
7597
7598     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7599
7600     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7601         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7602         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7603     )
7604     {
7605         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7606         if (!keepcopy) {
7607             /* on something like
7608              *    $r = qr/.../;
7609              *    /$qr/p;
7610              * the KEEPCOPY is set on the PMOP rather than the regex */
7611             if (PL_curpm && r == PM_GETRE(PL_curpm))
7612                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7613         }
7614         if (!keepcopy)
7615             goto warn_undef;
7616     }
7617
7618     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7619     switch (paren) {
7620       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7621       case RX_BUFF_IDX_PREMATCH:       /* $` */
7622         if (rx->offs[0].start != -1) {
7623                         i = rx->offs[0].start;
7624                         if (i > 0) {
7625                                 s1 = 0;
7626                                 t1 = i;
7627                                 goto getlen;
7628                         }
7629             }
7630         return 0;
7631
7632       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7633       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7634             if (rx->offs[0].end != -1) {
7635                         i = rx->sublen - rx->offs[0].end;
7636                         if (i > 0) {
7637                                 s1 = rx->offs[0].end;
7638                                 t1 = rx->sublen;
7639                                 goto getlen;
7640                         }
7641             }
7642         return 0;
7643
7644       default: /* $& / ${^MATCH}, $1, $2, ... */
7645             if (paren <= (I32)rx->nparens &&
7646             (s1 = rx->offs[paren].start) != -1 &&
7647             (t1 = rx->offs[paren].end) != -1)
7648             {
7649             i = t1 - s1;
7650             goto getlen;
7651         } else {
7652           warn_undef:
7653             if (ckWARN(WARN_UNINITIALIZED))
7654                 report_uninit((const SV *)sv);
7655             return 0;
7656         }
7657     }
7658   getlen:
7659     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7660         const char * const s = rx->subbeg - rx->suboffset + s1;
7661         const U8 *ep;
7662         STRLEN el;
7663
7664         i = t1 - s1;
7665         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7666                         i = el;
7667     }
7668     return i;
7669 }
7670
7671 SV*
7672 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7673 {
7674     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7675         PERL_UNUSED_ARG(rx);
7676         if (0)
7677             return NULL;
7678         else
7679             return newSVpvs("Regexp");
7680 }
7681
7682 /* Scans the name of a named buffer from the pattern.
7683  * If flags is REG_RSN_RETURN_NULL returns null.
7684  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7685  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7686  * to the parsed name as looked up in the RExC_paren_names hash.
7687  * If there is an error throws a vFAIL().. type exception.
7688  */
7689
7690 #define REG_RSN_RETURN_NULL    0
7691 #define REG_RSN_RETURN_NAME    1
7692 #define REG_RSN_RETURN_DATA    2
7693
7694 STATIC SV*
7695 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7696 {
7697     char *name_start = RExC_parse;
7698
7699     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7700
7701     assert (RExC_parse <= RExC_end);
7702     if (RExC_parse == RExC_end) NOOP;
7703     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7704          /* skip IDFIRST by using do...while */
7705         if (UTF)
7706             do {
7707                 RExC_parse += UTF8SKIP(RExC_parse);
7708             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7709         else
7710             do {
7711                 RExC_parse++;
7712             } while (isWORDCHAR(*RExC_parse));
7713     } else {
7714         RExC_parse++; /* so the <- from the vFAIL is after the offending
7715                          character */
7716         vFAIL("Group name must start with a non-digit word character");
7717     }
7718     if ( flags ) {
7719         SV* sv_name
7720             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7721                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7722         if ( flags == REG_RSN_RETURN_NAME)
7723             return sv_name;
7724         else if (flags==REG_RSN_RETURN_DATA) {
7725             HE *he_str = NULL;
7726             SV *sv_dat = NULL;
7727             if ( ! sv_name )      /* should not happen*/
7728                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7729             if (RExC_paren_names)
7730                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7731             if ( he_str )
7732                 sv_dat = HeVAL(he_str);
7733             if ( ! sv_dat )
7734                 vFAIL("Reference to nonexistent named group");
7735             return sv_dat;
7736         }
7737         else {
7738             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7739                        (unsigned long) flags);
7740         }
7741         assert(0); /* NOT REACHED */
7742     }
7743     return NULL;
7744 }
7745
7746 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7747     int rem=(int)(RExC_end - RExC_parse);                       \
7748     int cut;                                                    \
7749     int num;                                                    \
7750     int iscut=0;                                                \
7751     if (rem>10) {                                               \
7752         rem=10;                                                 \
7753         iscut=1;                                                \
7754     }                                                           \
7755     cut=10-rem;                                                 \
7756     if (RExC_lastparse!=RExC_parse)                             \
7757         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7758             rem, RExC_parse,                                    \
7759             cut + 4,                                            \
7760             iscut ? "..." : "<"                                 \
7761         );                                                      \
7762     else                                                        \
7763         PerlIO_printf(Perl_debug_log,"%16s","");                \
7764                                                                 \
7765     if (SIZE_ONLY)                                              \
7766        num = RExC_size + 1;                                     \
7767     else                                                        \
7768        num=REG_NODE_NUM(RExC_emit);                             \
7769     if (RExC_lastnum!=num)                                      \
7770        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7771     else                                                        \
7772        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7773     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7774         (int)((depth*2)), "",                                   \
7775         (funcname)                                              \
7776     );                                                          \
7777     RExC_lastnum=num;                                           \
7778     RExC_lastparse=RExC_parse;                                  \
7779 })
7780
7781
7782
7783 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7784     DEBUG_PARSE_MSG((funcname));                            \
7785     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7786 })
7787 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7788     DEBUG_PARSE_MSG((funcname));                            \
7789     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7790 })
7791
7792 /* This section of code defines the inversion list object and its methods.  The
7793  * interfaces are highly subject to change, so as much as possible is static to
7794  * this file.  An inversion list is here implemented as a malloc'd C UV array
7795  * as an SVt_INVLIST scalar.
7796  *
7797  * An inversion list for Unicode is an array of code points, sorted by ordinal
7798  * number.  The zeroth element is the first code point in the list.  The 1th
7799  * element is the first element beyond that not in the list.  In other words,
7800  * the first range is
7801  *  invlist[0]..(invlist[1]-1)
7802  * The other ranges follow.  Thus every element whose index is divisible by two
7803  * marks the beginning of a range that is in the list, and every element not
7804  * divisible by two marks the beginning of a range not in the list.  A single
7805  * element inversion list that contains the single code point N generally
7806  * consists of two elements
7807  *  invlist[0] == N
7808  *  invlist[1] == N+1
7809  * (The exception is when N is the highest representable value on the
7810  * machine, in which case the list containing just it would be a single
7811  * element, itself.  By extension, if the last range in the list extends to
7812  * infinity, then the first element of that range will be in the inversion list
7813  * at a position that is divisible by two, and is the final element in the
7814  * list.)
7815  * Taking the complement (inverting) an inversion list is quite simple, if the
7816  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7817  * This implementation reserves an element at the beginning of each inversion
7818  * list to always contain 0; there is an additional flag in the header which
7819  * indicates if the list begins at the 0, or is offset to begin at the next
7820  * element.
7821  *
7822  * More about inversion lists can be found in "Unicode Demystified"
7823  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7824  * More will be coming when functionality is added later.
7825  *
7826  * The inversion list data structure is currently implemented as an SV pointing
7827  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7828  * array of UV whose memory management is automatically handled by the existing
7829  * facilities for SV's.
7830  *
7831  * Some of the methods should always be private to the implementation, and some
7832  * should eventually be made public */
7833
7834 /* The header definitions are in F<inline_invlist.c> */
7835
7836 PERL_STATIC_INLINE UV*
7837 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7838 {
7839     /* Returns a pointer to the first element in the inversion list's array.
7840      * This is called upon initialization of an inversion list.  Where the
7841      * array begins depends on whether the list has the code point U+0000 in it
7842      * or not.  The other parameter tells it whether the code that follows this
7843      * call is about to put a 0 in the inversion list or not.  The first
7844      * element is either the element reserved for 0, if TRUE, or the element
7845      * after it, if FALSE */
7846
7847     bool* offset = get_invlist_offset_addr(invlist);
7848     UV* zero_addr = (UV *) SvPVX(invlist);
7849
7850     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7851
7852     /* Must be empty */
7853     assert(! _invlist_len(invlist));
7854
7855     *zero_addr = 0;
7856
7857     /* 1^1 = 0; 1^0 = 1 */
7858     *offset = 1 ^ will_have_0;
7859     return zero_addr + *offset;
7860 }
7861
7862 PERL_STATIC_INLINE UV*
7863 S_invlist_array(SV* const invlist)
7864 {
7865     /* Returns the pointer to the inversion list's array.  Every time the
7866      * length changes, this needs to be called in case malloc or realloc moved
7867      * it */
7868
7869     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7870
7871     /* Must not be empty.  If these fail, you probably didn't check for <len>
7872      * being non-zero before trying to get the array */
7873     assert(_invlist_len(invlist));
7874
7875     /* The very first element always contains zero, The array begins either
7876      * there, or if the inversion list is offset, at the element after it.
7877      * The offset header field determines which; it contains 0 or 1 to indicate
7878      * how much additionally to add */
7879     assert(0 == *(SvPVX(invlist)));
7880     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7881 }
7882
7883 PERL_STATIC_INLINE void
7884 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7885 {
7886     /* Sets the current number of elements stored in the inversion list.
7887      * Updates SvCUR correspondingly */
7888     PERL_UNUSED_CONTEXT;
7889     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7890
7891     assert(SvTYPE(invlist) == SVt_INVLIST);
7892
7893     SvCUR_set(invlist,
7894               (len == 0)
7895                ? 0
7896                : TO_INTERNAL_SIZE(len + offset));
7897     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7898 }
7899
7900 PERL_STATIC_INLINE IV*
7901 S_get_invlist_previous_index_addr(SV* invlist)
7902 {
7903     /* Return the address of the IV that is reserved to hold the cached index
7904      * */
7905     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7906
7907     assert(SvTYPE(invlist) == SVt_INVLIST);
7908
7909     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7910 }
7911
7912 PERL_STATIC_INLINE IV
7913 S_invlist_previous_index(SV* const invlist)
7914 {
7915     /* Returns cached index of previous search */
7916
7917     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7918
7919     return *get_invlist_previous_index_addr(invlist);
7920 }
7921
7922 PERL_STATIC_INLINE void
7923 S_invlist_set_previous_index(SV* const invlist, const IV index)
7924 {
7925     /* Caches <index> for later retrieval */
7926
7927     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7928
7929     assert(index == 0 || index < (int) _invlist_len(invlist));
7930
7931     *get_invlist_previous_index_addr(invlist) = index;
7932 }
7933
7934 PERL_STATIC_INLINE UV
7935 S_invlist_max(SV* const invlist)
7936 {
7937     /* Returns the maximum number of elements storable in the inversion list's
7938      * array, without having to realloc() */
7939
7940     PERL_ARGS_ASSERT_INVLIST_MAX;
7941
7942     assert(SvTYPE(invlist) == SVt_INVLIST);
7943
7944     /* Assumes worst case, in which the 0 element is not counted in the
7945      * inversion list, so subtracts 1 for that */
7946     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7947            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7948            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7949 }
7950
7951 #ifndef PERL_IN_XSUB_RE
7952 SV*
7953 Perl__new_invlist(pTHX_ IV initial_size)
7954 {
7955
7956     /* Return a pointer to a newly constructed inversion list, with enough
7957      * space to store 'initial_size' elements.  If that number is negative, a
7958      * system default is used instead */
7959
7960     SV* new_list;
7961
7962     if (initial_size < 0) {
7963         initial_size = 10;
7964     }
7965
7966     /* Allocate the initial space */
7967     new_list = newSV_type(SVt_INVLIST);
7968
7969     /* First 1 is in case the zero element isn't in the list; second 1 is for
7970      * trailing NUL */
7971     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7972     invlist_set_len(new_list, 0, 0);
7973
7974     /* Force iterinit() to be used to get iteration to work */
7975     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7976
7977     *get_invlist_previous_index_addr(new_list) = 0;
7978
7979     return new_list;
7980 }
7981
7982 SV*
7983 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7984 {
7985     /* Return a pointer to a newly constructed inversion list, initialized to
7986      * point to <list>, which has to be in the exact correct inversion list
7987      * form, including internal fields.  Thus this is a dangerous routine that
7988      * should not be used in the wrong hands.  The passed in 'list' contains
7989      * several header fields at the beginning that are not part of the
7990      * inversion list body proper */
7991
7992     const STRLEN length = (STRLEN) list[0];
7993     const UV version_id =          list[1];
7994     const bool offset   =    cBOOL(list[2]);
7995 #define HEADER_LENGTH 3
7996     /* If any of the above changes in any way, you must change HEADER_LENGTH
7997      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7998      *      perl -E 'say int(rand 2**31-1)'
7999      */
8000 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8001                                         data structure type, so that one being
8002                                         passed in can be validated to be an
8003                                         inversion list of the correct vintage.
8004                                        */
8005
8006     SV* invlist = newSV_type(SVt_INVLIST);
8007
8008     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8009
8010     if (version_id != INVLIST_VERSION_ID) {
8011         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8012     }
8013
8014     /* The generated array passed in includes header elements that aren't part
8015      * of the list proper, so start it just after them */
8016     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8017
8018     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8019                                shouldn't touch it */
8020
8021     *(get_invlist_offset_addr(invlist)) = offset;
8022
8023     /* The 'length' passed to us is the physical number of elements in the
8024      * inversion list.  But if there is an offset the logical number is one
8025      * less than that */
8026     invlist_set_len(invlist, length  - offset, offset);
8027
8028     invlist_set_previous_index(invlist, 0);
8029
8030     /* Initialize the iteration pointer. */
8031     invlist_iterfinish(invlist);
8032
8033     SvREADONLY_on(invlist);
8034
8035     return invlist;
8036 }
8037 #endif /* ifndef PERL_IN_XSUB_RE */
8038
8039 STATIC void
8040 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8041 {
8042     /* Grow the maximum size of an inversion list */
8043
8044     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8045
8046     assert(SvTYPE(invlist) == SVt_INVLIST);
8047
8048     /* Add one to account for the zero element at the beginning which may not
8049      * be counted by the calling parameters */
8050     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8051 }
8052
8053 PERL_STATIC_INLINE void
8054 S_invlist_trim(SV* const invlist)
8055 {
8056     PERL_ARGS_ASSERT_INVLIST_TRIM;
8057
8058     assert(SvTYPE(invlist) == SVt_INVLIST);
8059
8060     /* Change the length of the inversion list to how many entries it currently
8061      * has */
8062     SvPV_shrink_to_cur((SV *) invlist);
8063 }
8064
8065 STATIC void
8066 S__append_range_to_invlist(pTHX_ SV* const invlist,
8067                                  const UV start, const UV end)
8068 {
8069    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8070     * the end of the inversion list.  The range must be above any existing
8071     * ones. */
8072
8073     UV* array;
8074     UV max = invlist_max(invlist);
8075     UV len = _invlist_len(invlist);
8076     bool offset;
8077
8078     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8079
8080     if (len == 0) { /* Empty lists must be initialized */
8081         offset = start != 0;
8082         array = _invlist_array_init(invlist, ! offset);
8083     }
8084     else {
8085         /* Here, the existing list is non-empty. The current max entry in the
8086          * list is generally the first value not in the set, except when the
8087          * set extends to the end of permissible values, in which case it is
8088          * the first entry in that final set, and so this call is an attempt to
8089          * append out-of-order */
8090
8091         UV final_element = len - 1;
8092         array = invlist_array(invlist);
8093         if (array[final_element] > start
8094             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8095         {
8096             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8097                      array[final_element], start,
8098                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8099         }
8100
8101         /* Here, it is a legal append.  If the new range begins with the first
8102          * value not in the set, it is extending the set, so the new first
8103          * value not in the set is one greater than the newly extended range.
8104          * */
8105         offset = *get_invlist_offset_addr(invlist);
8106         if (array[final_element] == start) {
8107             if (end != UV_MAX) {
8108                 array[final_element] = end + 1;
8109             }
8110             else {
8111                 /* But if the end is the maximum representable on the machine,
8112                  * just let the range that this would extend to have no end */
8113                 invlist_set_len(invlist, len - 1, offset);
8114             }
8115             return;
8116         }
8117     }
8118
8119     /* Here the new range doesn't extend any existing set.  Add it */
8120
8121     len += 2;   /* Includes an element each for the start and end of range */
8122
8123     /* If wll overflow the existing space, extend, which may cause the array to
8124      * be moved */
8125     if (max < len) {
8126         invlist_extend(invlist, len);
8127
8128         /* Have to set len here to avoid assert failure in invlist_array() */
8129         invlist_set_len(invlist, len, offset);
8130
8131         array = invlist_array(invlist);
8132     }
8133     else {
8134         invlist_set_len(invlist, len, offset);
8135     }
8136
8137     /* The next item on the list starts the range, the one after that is
8138      * one past the new range.  */
8139     array[len - 2] = start;
8140     if (end != UV_MAX) {
8141         array[len - 1] = end + 1;
8142     }
8143     else {
8144         /* But if the end is the maximum representable on the machine, just let
8145          * the range have no end */
8146         invlist_set_len(invlist, len - 1, offset);
8147     }
8148 }
8149
8150 #ifndef PERL_IN_XSUB_RE
8151
8152 IV
8153 Perl__invlist_search(SV* const invlist, const UV cp)
8154 {
8155     /* Searches the inversion list for the entry that contains the input code
8156      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8157      * return value is the index into the list's array of the range that
8158      * contains <cp> */
8159
8160     IV low = 0;
8161     IV mid;
8162     IV high = _invlist_len(invlist);
8163     const IV highest_element = high - 1;
8164     const UV* array;
8165
8166     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8167
8168     /* If list is empty, return failure. */
8169     if (high == 0) {
8170         return -1;
8171     }
8172
8173     /* (We can't get the array unless we know the list is non-empty) */
8174     array = invlist_array(invlist);
8175
8176     mid = invlist_previous_index(invlist);
8177     assert(mid >=0 && mid <= highest_element);
8178
8179     /* <mid> contains the cache of the result of the previous call to this
8180      * function (0 the first time).  See if this call is for the same result,
8181      * or if it is for mid-1.  This is under the theory that calls to this
8182      * function will often be for related code points that are near each other.
8183      * And benchmarks show that caching gives better results.  We also test
8184      * here if the code point is within the bounds of the list.  These tests
8185      * replace others that would have had to be made anyway to make sure that
8186      * the array bounds were not exceeded, and these give us extra information
8187      * at the same time */
8188     if (cp >= array[mid]) {
8189         if (cp >= array[highest_element]) {
8190             return highest_element;
8191         }
8192
8193         /* Here, array[mid] <= cp < array[highest_element].  This means that
8194          * the final element is not the answer, so can exclude it; it also
8195          * means that <mid> is not the final element, so can refer to 'mid + 1'
8196          * safely */
8197         if (cp < array[mid + 1]) {
8198             return mid;
8199         }
8200         high--;
8201         low = mid + 1;
8202     }
8203     else { /* cp < aray[mid] */
8204         if (cp < array[0]) { /* Fail if outside the array */
8205             return -1;
8206         }
8207         high = mid;
8208         if (cp >= array[mid - 1]) {
8209             goto found_entry;
8210         }
8211     }
8212
8213     /* Binary search.  What we are looking for is <i> such that
8214      *  array[i] <= cp < array[i+1]
8215      * The loop below converges on the i+1.  Note that there may not be an
8216      * (i+1)th element in the array, and things work nonetheless */
8217     while (low < high) {
8218         mid = (low + high) / 2;
8219         assert(mid <= highest_element);
8220         if (array[mid] <= cp) { /* cp >= array[mid] */
8221             low = mid + 1;
8222
8223             /* We could do this extra test to exit the loop early.
8224             if (cp < array[low]) {
8225                 return mid;
8226             }
8227             */
8228         }
8229         else { /* cp < array[mid] */
8230             high = mid;
8231         }
8232     }
8233
8234   found_entry:
8235     high--;
8236     invlist_set_previous_index(invlist, high);
8237     return high;
8238 }
8239
8240 void
8241 Perl__invlist_populate_swatch(SV* const invlist,
8242                               const UV start, const UV end, U8* swatch)
8243 {
8244     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8245      * but is used when the swash has an inversion list.  This makes this much
8246      * faster, as it uses a binary search instead of a linear one.  This is
8247      * intimately tied to that function, and perhaps should be in utf8.c,
8248      * except it is intimately tied to inversion lists as well.  It assumes
8249      * that <swatch> is all 0's on input */
8250
8251     UV current = start;
8252     const IV len = _invlist_len(invlist);
8253     IV i;
8254     const UV * array;
8255
8256     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8257
8258     if (len == 0) { /* Empty inversion list */
8259         return;
8260     }
8261
8262     array = invlist_array(invlist);
8263
8264     /* Find which element it is */
8265     i = _invlist_search(invlist, start);
8266
8267     /* We populate from <start> to <end> */
8268     while (current < end) {
8269         UV upper;
8270
8271         /* The inversion list gives the results for every possible code point
8272          * after the first one in the list.  Only those ranges whose index is
8273          * even are ones that the inversion list matches.  For the odd ones,
8274          * and if the initial code point is not in the list, we have to skip
8275          * forward to the next element */
8276         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8277             i++;
8278             if (i >= len) { /* Finished if beyond the end of the array */
8279                 return;
8280             }
8281             current = array[i];
8282             if (current >= end) {   /* Finished if beyond the end of what we
8283                                        are populating */
8284                 if (LIKELY(end < UV_MAX)) {
8285                     return;
8286                 }
8287
8288                 /* We get here when the upper bound is the maximum
8289                  * representable on the machine, and we are looking for just
8290                  * that code point.  Have to special case it */
8291                 i = len;
8292                 goto join_end_of_list;
8293             }
8294         }
8295         assert(current >= start);
8296
8297         /* The current range ends one below the next one, except don't go past
8298          * <end> */
8299         i++;
8300         upper = (i < len && array[i] < end) ? array[i] : end;
8301
8302         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8303          * for each code point in it */
8304         for (; current < upper; current++) {
8305             const STRLEN offset = (STRLEN)(current - start);
8306             swatch[offset >> 3] |= 1 << (offset & 7);
8307         }
8308
8309     join_end_of_list:
8310
8311         /* Quit if at the end of the list */
8312         if (i >= len) {
8313
8314             /* But first, have to deal with the highest possible code point on
8315              * the platform.  The previous code assumes that <end> is one
8316              * beyond where we want to populate, but that is impossible at the
8317              * platform's infinity, so have to handle it specially */
8318             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8319             {
8320                 const STRLEN offset = (STRLEN)(end - start);
8321                 swatch[offset >> 3] |= 1 << (offset & 7);
8322             }
8323             return;
8324         }
8325
8326         /* Advance to the next range, which will be for code points not in the
8327          * inversion list */
8328         current = array[i];
8329     }
8330
8331     return;
8332 }
8333
8334 void
8335 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8336                                          const bool complement_b, SV** output)
8337 {
8338     /* Take the union of two inversion lists and point <output> to it.  *output
8339      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8340      * the reference count to that list will be decremented if not already a
8341      * temporary (mortal); otherwise *output will be made correspondingly
8342      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8343      * second list is returned.  If <complement_b> is TRUE, the union is taken
8344      * of the complement (inversion) of <b> instead of b itself.
8345      *
8346      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8347      * Richard Gillam, published by Addison-Wesley, and explained at some
8348      * length there.  The preface says to incorporate its examples into your
8349      * code at your own risk.
8350      *
8351      * The algorithm is like a merge sort.
8352      *
8353      * XXX A potential performance improvement is to keep track as we go along
8354      * if only one of the inputs contributes to the result, meaning the other
8355      * is a subset of that one.  In that case, we can skip the final copy and
8356      * return the larger of the input lists, but then outside code might need
8357      * to keep track of whether to free the input list or not */
8358
8359     const UV* array_a;    /* a's array */
8360     const UV* array_b;
8361     UV len_a;       /* length of a's array */
8362     UV len_b;
8363
8364     SV* u;                      /* the resulting union */
8365     UV* array_u;
8366     UV len_u;
8367
8368     UV i_a = 0;             /* current index into a's array */
8369     UV i_b = 0;
8370     UV i_u = 0;
8371
8372     /* running count, as explained in the algorithm source book; items are
8373      * stopped accumulating and are output when the count changes to/from 0.
8374      * The count is incremented when we start a range that's in the set, and
8375      * decremented when we start a range that's not in the set.  So its range
8376      * is 0 to 2.  Only when the count is zero is something not in the set.
8377      */
8378     UV count = 0;
8379
8380     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8381     assert(a != b);
8382
8383     /* If either one is empty, the union is the other one */
8384     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8385         bool make_temp = FALSE; /* Should we mortalize the result? */
8386
8387         if (*output == a) {
8388             if (a != NULL) {
8389                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8390                     SvREFCNT_dec_NN(a);
8391                 }
8392             }
8393         }
8394         if (*output != b) {
8395             *output = invlist_clone(b);
8396             if (complement_b) {
8397                 _invlist_invert(*output);
8398             }
8399         } /* else *output already = b; */
8400
8401         if (make_temp) {
8402             sv_2mortal(*output);
8403         }
8404         return;
8405     }
8406     else if ((len_b = _invlist_len(b)) == 0) {
8407         bool make_temp = FALSE;
8408         if (*output == b) {
8409             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8410                 SvREFCNT_dec_NN(b);
8411             }
8412         }
8413
8414         /* The complement of an empty list is a list that has everything in it,
8415          * so the union with <a> includes everything too */
8416         if (complement_b) {
8417             if (a == *output) {
8418                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8419                     SvREFCNT_dec_NN(a);
8420                 }
8421             }
8422             *output = _new_invlist(1);
8423             _append_range_to_invlist(*output, 0, UV_MAX);
8424         }
8425         else if (*output != a) {
8426             *output = invlist_clone(a);
8427         }
8428         /* else *output already = a; */
8429
8430         if (make_temp) {
8431             sv_2mortal(*output);
8432         }
8433         return;
8434     }
8435
8436     /* Here both lists exist and are non-empty */
8437     array_a = invlist_array(a);
8438     array_b = invlist_array(b);
8439
8440     /* If are to take the union of 'a' with the complement of b, set it
8441      * up so are looking at b's complement. */
8442     if (complement_b) {
8443
8444         /* To complement, we invert: if the first element is 0, remove it.  To
8445          * do this, we just pretend the array starts one later */
8446         if (array_b[0] == 0) {
8447             array_b++;
8448             len_b--;
8449         }
8450         else {
8451
8452             /* But if the first element is not zero, we pretend the list starts
8453              * at the 0 that is always stored immediately before the array. */
8454             array_b--;
8455             len_b++;
8456         }
8457     }
8458
8459     /* Size the union for the worst case: that the sets are completely
8460      * disjoint */
8461     u = _new_invlist(len_a + len_b);
8462
8463     /* Will contain U+0000 if either component does */
8464     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8465                                       || (len_b > 0 && array_b[0] == 0));
8466
8467     /* Go through each list item by item, stopping when exhausted one of
8468      * them */
8469     while (i_a < len_a && i_b < len_b) {
8470         UV cp;      /* The element to potentially add to the union's array */
8471         bool cp_in_set;   /* is it in the the input list's set or not */
8472
8473         /* We need to take one or the other of the two inputs for the union.
8474          * Since we are merging two sorted lists, we take the smaller of the
8475          * next items.  In case of a tie, we take the one that is in its set
8476          * first.  If we took one not in the set first, it would decrement the
8477          * count, possibly to 0 which would cause it to be output as ending the
8478          * range, and the next time through we would take the same number, and
8479          * output it again as beginning the next range.  By doing it the
8480          * opposite way, there is no possibility that the count will be
8481          * momentarily decremented to 0, and thus the two adjoining ranges will
8482          * be seamlessly merged.  (In a tie and both are in the set or both not
8483          * in the set, it doesn't matter which we take first.) */
8484         if (array_a[i_a] < array_b[i_b]
8485             || (array_a[i_a] == array_b[i_b]
8486                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8487         {
8488             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8489             cp= array_a[i_a++];
8490         }
8491         else {
8492             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8493             cp = array_b[i_b++];
8494         }
8495
8496         /* Here, have chosen which of the two inputs to look at.  Only output
8497          * if the running count changes to/from 0, which marks the
8498          * beginning/end of a range in that's in the set */
8499         if (cp_in_set) {
8500             if (count == 0) {
8501                 array_u[i_u++] = cp;
8502             }
8503             count++;
8504         }
8505         else {
8506             count--;
8507             if (count == 0) {
8508                 array_u[i_u++] = cp;
8509             }
8510         }
8511     }
8512
8513     /* Here, we are finished going through at least one of the lists, which
8514      * means there is something remaining in at most one.  We check if the list
8515      * that hasn't been exhausted is positioned such that we are in the middle
8516      * of a range in its set or not.  (i_a and i_b point to the element beyond
8517      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8518      * is potentially more to output.
8519      * There are four cases:
8520      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8521      *     in the union is entirely from the non-exhausted set.
8522      *  2) Both were in their sets, count is 2.  Nothing further should
8523      *     be output, as everything that remains will be in the exhausted
8524      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8525      *     that
8526      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8527      *     Nothing further should be output because the union includes
8528      *     everything from the exhausted set.  Not decrementing ensures that.
8529      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8530      *     decrementing to 0 insures that we look at the remainder of the
8531      *     non-exhausted set */
8532     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8533         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8534     {
8535         count--;
8536     }
8537
8538     /* The final length is what we've output so far, plus what else is about to
8539      * be output.  (If 'count' is non-zero, then the input list we exhausted
8540      * has everything remaining up to the machine's limit in its set, and hence
8541      * in the union, so there will be no further output. */
8542     len_u = i_u;
8543     if (count == 0) {
8544         /* At most one of the subexpressions will be non-zero */
8545         len_u += (len_a - i_a) + (len_b - i_b);
8546     }
8547
8548     /* Set result to final length, which can change the pointer to array_u, so
8549      * re-find it */
8550     if (len_u != _invlist_len(u)) {
8551         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8552         invlist_trim(u);
8553         array_u = invlist_array(u);
8554     }
8555
8556     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8557      * the other) ended with everything above it not in its set.  That means
8558      * that the remaining part of the union is precisely the same as the
8559      * non-exhausted list, so can just copy it unchanged.  (If both list were
8560      * exhausted at the same time, then the operations below will be both 0.)
8561      */
8562     if (count == 0) {
8563         IV copy_count; /* At most one will have a non-zero copy count */
8564         if ((copy_count = len_a - i_a) > 0) {
8565             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8566         }
8567         else if ((copy_count = len_b - i_b) > 0) {
8568             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8569         }
8570     }
8571
8572     /*  We may be removing a reference to one of the inputs.  If so, the output
8573      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8574      *  count decremented) */
8575     if (a == *output || b == *output) {
8576         assert(! invlist_is_iterating(*output));
8577         if ((SvTEMP(*output))) {
8578             sv_2mortal(u);
8579         }
8580         else {
8581             SvREFCNT_dec_NN(*output);
8582         }
8583     }
8584
8585     *output = u;
8586
8587     return;
8588 }
8589
8590 void
8591 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8592                                                const bool complement_b, SV** i)
8593 {
8594     /* Take the intersection of two inversion lists and point <i> to it.  *i
8595      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8596      * the reference count to that list will be decremented if not already a
8597      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8598      * The first list, <a>, may be NULL, in which case an empty list is
8599      * returned.  If <complement_b> is TRUE, the result will be the
8600      * intersection of <a> and the complement (or inversion) of <b> instead of
8601      * <b> directly.
8602      *
8603      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8604      * Richard Gillam, published by Addison-Wesley, and explained at some
8605      * length there.  The preface says to incorporate its examples into your
8606      * code at your own risk.  In fact, it had bugs
8607      *
8608      * The algorithm is like a merge sort, and is essentially the same as the
8609      * union above
8610      */
8611
8612     const UV* array_a;          /* a's array */
8613     const UV* array_b;
8614     UV len_a;   /* length of a's array */
8615     UV len_b;
8616
8617     SV* r;                   /* the resulting intersection */
8618     UV* array_r;
8619     UV len_r;
8620
8621     UV i_a = 0;             /* current index into a's array */
8622     UV i_b = 0;
8623     UV i_r = 0;
8624
8625     /* running count, as explained in the algorithm source book; items are
8626      * stopped accumulating and are output when the count changes to/from 2.
8627      * The count is incremented when we start a range that's in the set, and
8628      * decremented when we start a range that's not in the set.  So its range
8629      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8630      */
8631     UV count = 0;
8632
8633     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8634     assert(a != b);
8635
8636     /* Special case if either one is empty */
8637     len_a = (a == NULL) ? 0 : _invlist_len(a);
8638     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8639         bool make_temp = FALSE;
8640
8641         if (len_a != 0 && complement_b) {
8642
8643             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8644              * be empty.  Here, also we are using 'b's complement, which hence
8645              * must be every possible code point.  Thus the intersection is
8646              * simply 'a'. */
8647             if (*i != a) {
8648                 if (*i == b) {
8649                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8650                         SvREFCNT_dec_NN(b);
8651                     }
8652                 }
8653
8654                 *i = invlist_clone(a);
8655             }
8656             /* else *i is already 'a' */
8657
8658             if (make_temp) {
8659                 sv_2mortal(*i);
8660             }
8661             return;
8662         }
8663
8664         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8665          * intersection must be empty */
8666         if (*i == a) {
8667             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8668                 SvREFCNT_dec_NN(a);
8669             }
8670         }
8671         else if (*i == b) {
8672             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8673                 SvREFCNT_dec_NN(b);
8674             }
8675         }
8676         *i = _new_invlist(0);
8677         if (make_temp) {
8678             sv_2mortal(*i);
8679         }
8680
8681         return;
8682     }
8683
8684     /* Here both lists exist and are non-empty */
8685     array_a = invlist_array(a);
8686     array_b = invlist_array(b);
8687
8688     /* If are to take the intersection of 'a' with the complement of b, set it
8689      * up so are looking at b's complement. */
8690     if (complement_b) {
8691
8692         /* To complement, we invert: if the first element is 0, remove it.  To
8693          * do this, we just pretend the array starts one later */
8694         if (array_b[0] == 0) {
8695             array_b++;
8696             len_b--;
8697         }
8698         else {
8699
8700             /* But if the first element is not zero, we pretend the list starts
8701              * at the 0 that is always stored immediately before the array. */
8702             array_b--;
8703             len_b++;
8704         }
8705     }
8706
8707     /* Size the intersection for the worst case: that the intersection ends up
8708      * fragmenting everything to be completely disjoint */
8709     r= _new_invlist(len_a + len_b);
8710
8711     /* Will contain U+0000 iff both components do */
8712     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8713                                      && len_b > 0 && array_b[0] == 0);
8714
8715     /* Go through each list item by item, stopping when exhausted one of
8716      * them */
8717     while (i_a < len_a && i_b < len_b) {
8718         UV cp;      /* The element to potentially add to the intersection's
8719                        array */
8720         bool cp_in_set; /* Is it in the input list's set or not */
8721
8722         /* We need to take one or the other of the two inputs for the
8723          * intersection.  Since we are merging two sorted lists, we take the
8724          * smaller of the next items.  In case of a tie, we take the one that
8725          * is not in its set first (a difference from the union algorithm).  If
8726          * we took one in the set first, it would increment the count, possibly
8727          * to 2 which would cause it to be output as starting a range in the
8728          * intersection, and the next time through we would take that same
8729          * number, and output it again as ending the set.  By doing it the
8730          * opposite of this, there is no possibility that the count will be
8731          * momentarily incremented to 2.  (In a tie and both are in the set or
8732          * both not in the set, it doesn't matter which we take first.) */
8733         if (array_a[i_a] < array_b[i_b]
8734             || (array_a[i_a] == array_b[i_b]
8735                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8736         {
8737             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8738             cp= array_a[i_a++];
8739         }
8740         else {
8741             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8742             cp= array_b[i_b++];
8743         }
8744
8745         /* Here, have chosen which of the two inputs to look at.  Only output
8746          * if the running count changes to/from 2, which marks the
8747          * beginning/end of a range that's in the intersection */
8748         if (cp_in_set) {
8749             count++;
8750             if (count == 2) {
8751                 array_r[i_r++] = cp;
8752             }
8753         }
8754         else {
8755             if (count == 2) {
8756                 array_r[i_r++] = cp;
8757             }
8758             count--;
8759         }
8760     }
8761
8762     /* Here, we are finished going through at least one of the lists, which
8763      * means there is something remaining in at most one.  We check if the list
8764      * that has been exhausted is positioned such that we are in the middle
8765      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8766      * the ones we care about.)  There are four cases:
8767      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8768      *     nothing left in the intersection.
8769      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8770      *     above 2.  What should be output is exactly that which is in the
8771      *     non-exhausted set, as everything it has is also in the intersection
8772      *     set, and everything it doesn't have can't be in the intersection
8773      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8774      *     gets incremented to 2.  Like the previous case, the intersection is
8775      *     everything that remains in the non-exhausted set.
8776      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8777      *     remains 1.  And the intersection has nothing more. */
8778     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8779         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8780     {
8781         count++;
8782     }
8783
8784     /* The final length is what we've output so far plus what else is in the
8785      * intersection.  At most one of the subexpressions below will be non-zero
8786      * */
8787     len_r = i_r;
8788     if (count >= 2) {
8789         len_r += (len_a - i_a) + (len_b - i_b);
8790     }
8791
8792     /* Set result to final length, which can change the pointer to array_r, so
8793      * re-find it */
8794     if (len_r != _invlist_len(r)) {
8795         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8796         invlist_trim(r);
8797         array_r = invlist_array(r);
8798     }
8799
8800     /* Finish outputting any remaining */
8801     if (count >= 2) { /* At most one will have a non-zero copy count */
8802         IV copy_count;
8803         if ((copy_count = len_a - i_a) > 0) {
8804             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8805         }
8806         else if ((copy_count = len_b - i_b) > 0) {
8807             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8808         }
8809     }
8810
8811     /*  We may be removing a reference to one of the inputs.  If so, the output
8812      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8813      *  count decremented) */
8814     if (a == *i || b == *i) {
8815         assert(! invlist_is_iterating(*i));
8816         if (SvTEMP(*i)) {
8817             sv_2mortal(r);
8818         }
8819         else {
8820             SvREFCNT_dec_NN(*i);
8821         }
8822     }
8823
8824     *i = r;
8825
8826     return;
8827 }
8828
8829 SV*
8830 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8831 {
8832     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8833      * set.  A pointer to the inversion list is returned.  This may actually be
8834      * a new list, in which case the passed in one has been destroyed.  The
8835      * passed in inversion list can be NULL, in which case a new one is created
8836      * with just the one range in it */
8837
8838     SV* range_invlist;
8839     UV len;
8840
8841     if (invlist == NULL) {
8842         invlist = _new_invlist(2);
8843         len = 0;
8844     }
8845     else {
8846         len = _invlist_len(invlist);
8847     }
8848
8849     /* If comes after the final entry actually in the list, can just append it
8850      * to the end, */
8851     if (len == 0
8852         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8853             && start >= invlist_array(invlist)[len - 1]))
8854     {
8855         _append_range_to_invlist(invlist, start, end);
8856         return invlist;
8857     }
8858
8859     /* Here, can't just append things, create and return a new inversion list
8860      * which is the union of this range and the existing inversion list */
8861     range_invlist = _new_invlist(2);
8862     _append_range_to_invlist(range_invlist, start, end);
8863
8864     _invlist_union(invlist, range_invlist, &invlist);
8865
8866     /* The temporary can be freed */
8867     SvREFCNT_dec_NN(range_invlist);
8868
8869     return invlist;
8870 }
8871
8872 SV*
8873 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8874                                  UV** other_elements_ptr)
8875 {
8876     /* Create and return an inversion list whose contents are to be populated
8877      * by the caller.  The caller gives the number of elements (in 'size') and
8878      * the very first element ('element0').  This function will set
8879      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8880      * are to be placed.
8881      *
8882      * Obviously there is some trust involved that the caller will properly
8883      * fill in the other elements of the array.
8884      *
8885      * (The first element needs to be passed in, as the underlying code does
8886      * things differently depending on whether it is zero or non-zero) */
8887
8888     SV* invlist = _new_invlist(size);
8889     bool offset;
8890
8891     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8892
8893     _append_range_to_invlist(invlist, element0, element0);
8894     offset = *get_invlist_offset_addr(invlist);
8895
8896     invlist_set_len(invlist, size, offset);
8897     *other_elements_ptr = invlist_array(invlist) + 1;
8898     return invlist;
8899 }
8900
8901 #endif
8902
8903 PERL_STATIC_INLINE SV*
8904 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8905     return _add_range_to_invlist(invlist, cp, cp);
8906 }
8907
8908 #ifndef PERL_IN_XSUB_RE
8909 void
8910 Perl__invlist_invert(pTHX_ SV* const invlist)
8911 {
8912     /* Complement the input inversion list.  This adds a 0 if the list didn't
8913      * have a zero; removes it otherwise.  As described above, the data
8914      * structure is set up so that this is very efficient */
8915
8916     PERL_ARGS_ASSERT__INVLIST_INVERT;
8917
8918     assert(! invlist_is_iterating(invlist));
8919
8920     /* The inverse of matching nothing is matching everything */
8921     if (_invlist_len(invlist) == 0) {
8922         _append_range_to_invlist(invlist, 0, UV_MAX);
8923         return;
8924     }
8925
8926     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8927 }
8928
8929 #endif
8930
8931 PERL_STATIC_INLINE SV*
8932 S_invlist_clone(pTHX_ SV* const invlist)
8933 {
8934
8935     /* Return a new inversion list that is a copy of the input one, which is
8936      * unchanged.  The new list will not be mortal even if the old one was. */
8937
8938     /* Need to allocate extra space to accommodate Perl's addition of a
8939      * trailing NUL to SvPV's, since it thinks they are always strings */
8940     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8941     STRLEN physical_length = SvCUR(invlist);
8942     bool offset = *(get_invlist_offset_addr(invlist));
8943
8944     PERL_ARGS_ASSERT_INVLIST_CLONE;
8945
8946     *(get_invlist_offset_addr(new_invlist)) = offset;
8947     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8948     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8949
8950     return new_invlist;
8951 }
8952
8953 PERL_STATIC_INLINE STRLEN*
8954 S_get_invlist_iter_addr(SV* invlist)
8955 {
8956     /* Return the address of the UV that contains the current iteration
8957      * position */
8958
8959     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8960
8961     assert(SvTYPE(invlist) == SVt_INVLIST);
8962
8963     return &(((XINVLIST*) SvANY(invlist))->iterator);
8964 }
8965
8966 PERL_STATIC_INLINE void
8967 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8968 {
8969     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8970
8971     *get_invlist_iter_addr(invlist) = 0;
8972 }
8973
8974 PERL_STATIC_INLINE void
8975 S_invlist_iterfinish(SV* invlist)
8976 {
8977     /* Terminate iterator for invlist.  This is to catch development errors.
8978      * Any iteration that is interrupted before completed should call this
8979      * function.  Functions that add code points anywhere else but to the end
8980      * of an inversion list assert that they are not in the middle of an
8981      * iteration.  If they were, the addition would make the iteration
8982      * problematical: if the iteration hadn't reached the place where things
8983      * were being added, it would be ok */
8984
8985     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8986
8987     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8988 }
8989
8990 STATIC bool
8991 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8992 {
8993     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8994      * This call sets in <*start> and <*end>, the next range in <invlist>.
8995      * Returns <TRUE> if successful and the next call will return the next
8996      * range; <FALSE> if was already at the end of the list.  If the latter,
8997      * <*start> and <*end> are unchanged, and the next call to this function
8998      * will start over at the beginning of the list */
8999
9000     STRLEN* pos = get_invlist_iter_addr(invlist);
9001     UV len = _invlist_len(invlist);
9002     UV *array;
9003
9004     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9005
9006     if (*pos >= len) {
9007         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9008         return FALSE;
9009     }
9010
9011     array = invlist_array(invlist);
9012
9013     *start = array[(*pos)++];
9014
9015     if (*pos >= len) {
9016         *end = UV_MAX;
9017     }
9018     else {
9019         *end = array[(*pos)++] - 1;
9020     }
9021
9022     return TRUE;
9023 }
9024
9025 PERL_STATIC_INLINE bool
9026 S_invlist_is_iterating(SV* const invlist)
9027 {
9028     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9029
9030     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9031 }
9032
9033 PERL_STATIC_INLINE UV
9034 S_invlist_highest(SV* const invlist)
9035 {
9036     /* Returns the highest code point that matches an inversion list.  This API
9037      * has an ambiguity, as it returns 0 under either the highest is actually
9038      * 0, or if the list is empty.  If this distinction matters to you, check
9039      * for emptiness before calling this function */
9040
9041     UV len = _invlist_len(invlist);
9042     UV *array;
9043
9044     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9045
9046     if (len == 0) {
9047         return 0;
9048     }
9049
9050     array = invlist_array(invlist);
9051
9052     /* The last element in the array in the inversion list always starts a
9053      * range that goes to infinity.  That range may be for code points that are
9054      * matched in the inversion list, or it may be for ones that aren't
9055      * matched.  In the latter case, the highest code point in the set is one
9056      * less than the beginning of this range; otherwise it is the final element
9057      * of this range: infinity */
9058     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9059            ? UV_MAX
9060            : array[len - 1] - 1;
9061 }
9062
9063 #ifndef PERL_IN_XSUB_RE
9064 SV *
9065 Perl__invlist_contents(pTHX_ SV* const invlist)
9066 {
9067     /* Get the contents of an inversion list into a string SV so that they can
9068      * be printed out.  It uses the format traditionally done for debug tracing
9069      */
9070
9071     UV start, end;
9072     SV* output = newSVpvs("\n");
9073
9074     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9075
9076     assert(! invlist_is_iterating(invlist));
9077
9078     invlist_iterinit(invlist);
9079     while (invlist_iternext(invlist, &start, &end)) {
9080         if (end == UV_MAX) {
9081             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9082         }
9083         else if (end != start) {
9084             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9085                     start,       end);
9086         }
9087         else {
9088             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9089         }
9090     }
9091
9092     return output;
9093 }
9094 #endif
9095
9096 #ifndef PERL_IN_XSUB_RE
9097 void
9098 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9099                          const char * const indent, SV* const invlist)
9100 {
9101     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9102      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9103      * the string 'indent'.  The output looks like this:
9104          [0] 0x000A .. 0x000D
9105          [2] 0x0085
9106          [4] 0x2028 .. 0x2029
9107          [6] 0x3104 .. INFINITY
9108      * This means that the first range of code points matched by the list are
9109      * 0xA through 0xD; the second range contains only the single code point
9110      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9111      * are used to define each range (except if the final range extends to
9112      * infinity, only a single element is needed).  The array index of the
9113      * first element for the corresponding range is given in brackets. */
9114
9115     UV start, end;
9116     STRLEN count = 0;
9117
9118     PERL_ARGS_ASSERT__INVLIST_DUMP;
9119
9120     if (invlist_is_iterating(invlist)) {
9121         Perl_dump_indent(aTHX_ level, file,
9122              "%sCan't dump inversion list because is in middle of iterating\n",
9123              indent);
9124         return;
9125     }
9126
9127     invlist_iterinit(invlist);
9128     while (invlist_iternext(invlist, &start, &end)) {
9129         if (end == UV_MAX) {
9130             Perl_dump_indent(aTHX_ level, file,
9131                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9132                                    indent, (UV)count, start);
9133         }
9134         else if (end != start) {
9135             Perl_dump_indent(aTHX_ level, file,
9136                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9137                                 indent, (UV)count, start,         end);
9138         }
9139         else {
9140             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9141                                             indent, (UV)count, start);
9142         }
9143         count += 2;
9144     }
9145 }
9146
9147 void
9148 Perl__load_PL_utf8_foldclosures (pTHX)
9149 {
9150     assert(! PL_utf8_foldclosures);
9151
9152     /* If the folds haven't been read in, call a fold function
9153      * to force that */
9154     if (! PL_utf8_tofold) {
9155         U8 dummy[UTF8_MAXBYTES_CASE+1];
9156
9157         /* This string is just a short named one above \xff */
9158         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9159         assert(PL_utf8_tofold); /* Verify that worked */
9160     }
9161     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9162 }
9163 #endif
9164
9165 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9166 bool
9167 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9168 {
9169     /* Return a boolean as to if the two passed in inversion lists are
9170      * identical.  The final argument, if TRUE, says to take the complement of
9171      * the second inversion list before doing the comparison */
9172
9173     const UV* array_a = invlist_array(a);
9174     const UV* array_b = invlist_array(b);
9175     UV len_a = _invlist_len(a);
9176     UV len_b = _invlist_len(b);
9177
9178     UV i = 0;               /* current index into the arrays */
9179     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9180
9181     PERL_ARGS_ASSERT__INVLISTEQ;
9182
9183     /* If are to compare 'a' with the complement of b, set it
9184      * up so are looking at b's complement. */
9185     if (complement_b) {
9186
9187         /* The complement of nothing is everything, so <a> would have to have
9188          * just one element, starting at zero (ending at infinity) */
9189         if (len_b == 0) {
9190             return (len_a == 1 && array_a[0] == 0);
9191         }
9192         else if (array_b[0] == 0) {
9193
9194             /* Otherwise, to complement, we invert.  Here, the first element is
9195              * 0, just remove it.  To do this, we just pretend the array starts
9196              * one later */
9197
9198             array_b++;
9199             len_b--;
9200         }
9201         else {
9202
9203             /* But if the first element is not zero, we pretend the list starts
9204              * at the 0 that is always stored immediately before the array. */
9205             array_b--;
9206             len_b++;
9207         }
9208     }
9209
9210     /* Make sure that the lengths are the same, as well as the final element
9211      * before looping through the remainder.  (Thus we test the length, final,
9212      * and first elements right off the bat) */
9213     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9214         retval = FALSE;
9215     }
9216     else for (i = 0; i < len_a - 1; i++) {
9217         if (array_a[i] != array_b[i]) {
9218             retval = FALSE;
9219             break;
9220         }
9221     }
9222
9223     return retval;
9224 }
9225 #endif
9226
9227 #undef HEADER_LENGTH
9228 #undef TO_INTERNAL_SIZE
9229 #undef FROM_INTERNAL_SIZE
9230 #undef INVLIST_VERSION_ID
9231
9232 /* End of inversion list object */
9233
9234 STATIC void
9235 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9236 {
9237     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9238      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9239      * should point to the first flag; it is updated on output to point to the
9240      * final ')' or ':'.  There needs to be at least one flag, or this will
9241      * abort */
9242
9243     /* for (?g), (?gc), and (?o) warnings; warning
9244        about (?c) will warn about (?g) -- japhy    */
9245
9246 #define WASTED_O  0x01
9247 #define WASTED_G  0x02
9248 #define WASTED_C  0x04
9249 #define WASTED_GC (WASTED_G|WASTED_C)
9250     I32 wastedflags = 0x00;
9251     U32 posflags = 0, negflags = 0;
9252     U32 *flagsp = &posflags;
9253     char has_charset_modifier = '\0';
9254     regex_charset cs;
9255     bool has_use_defaults = FALSE;
9256     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9257
9258     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9259
9260     /* '^' as an initial flag sets certain defaults */
9261     if (UCHARAT(RExC_parse) == '^') {
9262         RExC_parse++;
9263         has_use_defaults = TRUE;
9264         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9265         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9266                                         ? REGEX_UNICODE_CHARSET
9267                                         : REGEX_DEPENDS_CHARSET);
9268     }
9269
9270     cs = get_regex_charset(RExC_flags);
9271     if (cs == REGEX_DEPENDS_CHARSET
9272         && (RExC_utf8 || RExC_uni_semantics))
9273     {
9274         cs = REGEX_UNICODE_CHARSET;
9275     }
9276
9277     while (*RExC_parse) {
9278         /* && strchr("iogcmsx", *RExC_parse) */
9279         /* (?g), (?gc) and (?o) are useless here
9280            and must be globally applied -- japhy */
9281         switch (*RExC_parse) {
9282
9283             /* Code for the imsx flags */
9284             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9285
9286             case LOCALE_PAT_MOD:
9287                 if (has_charset_modifier) {
9288                     goto excess_modifier;
9289                 }
9290                 else if (flagsp == &negflags) {
9291                     goto neg_modifier;
9292                 }
9293                 cs = REGEX_LOCALE_CHARSET;
9294                 has_charset_modifier = LOCALE_PAT_MOD;
9295                 break;
9296             case UNICODE_PAT_MOD:
9297                 if (has_charset_modifier) {
9298                     goto excess_modifier;
9299                 }
9300                 else if (flagsp == &negflags) {
9301                     goto neg_modifier;
9302                 }
9303                 cs = REGEX_UNICODE_CHARSET;
9304                 has_charset_modifier = UNICODE_PAT_MOD;
9305                 break;
9306             case ASCII_RESTRICT_PAT_MOD:
9307                 if (flagsp == &negflags) {
9308                     goto neg_modifier;
9309                 }
9310                 if (has_charset_modifier) {
9311                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9312                         goto excess_modifier;
9313                     }
9314                     /* Doubled modifier implies more restricted */
9315                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9316                 }
9317                 else {
9318                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9319                 }
9320                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9321                 break;
9322             case DEPENDS_PAT_MOD:
9323                 if (has_use_defaults) {
9324                     goto fail_modifiers;
9325                 }
9326                 else if (flagsp == &negflags) {
9327                     goto neg_modifier;
9328                 }
9329                 else if (has_charset_modifier) {
9330                     goto excess_modifier;
9331                 }
9332
9333                 /* The dual charset means unicode semantics if the
9334                  * pattern (or target, not known until runtime) are
9335                  * utf8, or something in the pattern indicates unicode
9336                  * semantics */
9337                 cs = (RExC_utf8 || RExC_uni_semantics)
9338                      ? REGEX_UNICODE_CHARSET
9339                      : REGEX_DEPENDS_CHARSET;
9340                 has_charset_modifier = DEPENDS_PAT_MOD;
9341                 break;
9342             excess_modifier:
9343                 RExC_parse++;
9344                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9345                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9346                 }
9347                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9348                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9349                                         *(RExC_parse - 1));
9350                 }
9351                 else {
9352                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9353                 }
9354                 /*NOTREACHED*/
9355             neg_modifier:
9356                 RExC_parse++;
9357                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9358                                     *(RExC_parse - 1));
9359                 /*NOTREACHED*/
9360             case ONCE_PAT_MOD: /* 'o' */
9361             case GLOBAL_PAT_MOD: /* 'g' */
9362                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9363                     const I32 wflagbit = *RExC_parse == 'o'
9364                                          ? WASTED_O
9365                                          : WASTED_G;
9366                     if (! (wastedflags & wflagbit) ) {
9367                         wastedflags |= wflagbit;
9368                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9369                         vWARN5(
9370                             RExC_parse + 1,
9371                             "Useless (%s%c) - %suse /%c modifier",
9372                             flagsp == &negflags ? "?-" : "?",
9373                             *RExC_parse,
9374                             flagsp == &negflags ? "don't " : "",
9375                             *RExC_parse
9376                         );
9377                     }
9378                 }
9379                 break;
9380
9381             case CONTINUE_PAT_MOD: /* 'c' */
9382                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9383                     if (! (wastedflags & WASTED_C) ) {
9384                         wastedflags |= WASTED_GC;
9385                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9386                         vWARN3(
9387                             RExC_parse + 1,
9388                             "Useless (%sc) - %suse /gc modifier",
9389                             flagsp == &negflags ? "?-" : "?",
9390                             flagsp == &negflags ? "don't " : ""
9391                         );
9392                     }
9393                 }
9394                 break;
9395             case KEEPCOPY_PAT_MOD: /* 'p' */
9396                 if (flagsp == &negflags) {
9397                     if (SIZE_ONLY)
9398                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9399                 } else {
9400                     *flagsp |= RXf_PMf_KEEPCOPY;
9401                 }
9402                 break;
9403             case '-':
9404                 /* A flag is a default iff it is following a minus, so
9405                  * if there is a minus, it means will be trying to
9406                  * re-specify a default which is an error */
9407                 if (has_use_defaults || flagsp == &negflags) {
9408                     goto fail_modifiers;
9409                 }
9410                 flagsp = &negflags;
9411                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9412                 break;
9413             case ':':
9414             case ')':
9415                 RExC_flags |= posflags;
9416                 RExC_flags &= ~negflags;
9417                 set_regex_charset(&RExC_flags, cs);
9418                 if (RExC_flags & RXf_PMf_FOLD) {
9419                     RExC_contains_i = 1;
9420                 }
9421                 return;
9422                 /*NOTREACHED*/
9423             default:
9424             fail_modifiers:
9425                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9426                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9427                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9428                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9429                 /*NOTREACHED*/
9430         }
9431
9432         ++RExC_parse;
9433     }
9434 }
9435
9436 /*
9437  - reg - regular expression, i.e. main body or parenthesized thing
9438  *
9439  * Caller must absorb opening parenthesis.
9440  *
9441  * Combining parenthesis handling with the base level of regular expression
9442  * is a trifle forced, but the need to tie the tails of the branches to what
9443  * follows makes it hard to avoid.
9444  */
9445 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9446 #ifdef DEBUGGING
9447 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9448 #else
9449 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9450 #endif
9451
9452 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9453    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9454    needs to be restarted.
9455    Otherwise would only return NULL if regbranch() returns NULL, which
9456    cannot happen.  */
9457 STATIC regnode *
9458 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9459     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9460      * 2 is like 1, but indicates that nextchar() has been called to advance
9461      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9462      * this flag alerts us to the need to check for that */
9463 {
9464     dVAR;
9465     regnode *ret;               /* Will be the head of the group. */
9466     regnode *br;
9467     regnode *lastbr;
9468     regnode *ender = NULL;
9469     I32 parno = 0;
9470     I32 flags;
9471     U32 oregflags = RExC_flags;
9472     bool have_branch = 0;
9473     bool is_open = 0;
9474     I32 freeze_paren = 0;
9475     I32 after_freeze = 0;
9476     I32 num; /* numeric backreferences */
9477
9478     char * parse_start = RExC_parse; /* MJD */
9479     char * const oregcomp_parse = RExC_parse;
9480
9481     GET_RE_DEBUG_FLAGS_DECL;
9482
9483     PERL_ARGS_ASSERT_REG;
9484     DEBUG_PARSE("reg ");
9485
9486     *flagp = 0;                         /* Tentatively. */
9487
9488
9489     /* Make an OPEN node, if parenthesized. */
9490     if (paren) {
9491
9492         /* Under /x, space and comments can be gobbled up between the '(' and
9493          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9494          * intervening space, as the sequence is a token, and a token should be
9495          * indivisible */
9496         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9497
9498         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9499             char *start_verb = RExC_parse;
9500             STRLEN verb_len = 0;
9501             char *start_arg = NULL;
9502             unsigned char op = 0;
9503             int argok = 1;
9504             int internal_argval = 0; /* internal_argval is only useful if
9505                                         !argok */
9506
9507             if (has_intervening_patws) {
9508                 RExC_parse++;
9509                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9510             }
9511             while ( *RExC_parse && *RExC_parse != ')' ) {
9512                 if ( *RExC_parse == ':' ) {
9513                     start_arg = RExC_parse + 1;
9514                     break;
9515                 }
9516                 RExC_parse++;
9517             }
9518             ++start_verb;
9519             verb_len = RExC_parse - start_verb;
9520             if ( start_arg ) {
9521                 RExC_parse++;
9522                 while ( *RExC_parse && *RExC_parse != ')' )
9523                     RExC_parse++;
9524                 if ( *RExC_parse != ')' )
9525                     vFAIL("Unterminated verb pattern argument");
9526                 if ( RExC_parse == start_arg )
9527                     start_arg = NULL;
9528             } else {
9529                 if ( *RExC_parse != ')' )
9530                     vFAIL("Unterminated verb pattern");
9531             }
9532
9533             switch ( *start_verb ) {
9534             case 'A':  /* (*ACCEPT) */
9535                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9536                     op = ACCEPT;
9537                     internal_argval = RExC_nestroot;
9538                 }
9539                 break;
9540             case 'C':  /* (*COMMIT) */
9541                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9542                     op = COMMIT;
9543                 break;
9544             case 'F':  /* (*FAIL) */
9545                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9546                     op = OPFAIL;
9547                     argok = 0;
9548                 }
9549                 break;
9550             case ':':  /* (*:NAME) */
9551             case 'M':  /* (*MARK:NAME) */
9552                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9553                     op = MARKPOINT;
9554                     argok = -1;
9555                 }
9556                 break;
9557             case 'P':  /* (*PRUNE) */
9558                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9559                     op = PRUNE;
9560                 break;
9561             case 'S':   /* (*SKIP) */
9562                 if ( memEQs(start_verb,verb_len,"SKIP") )
9563                     op = SKIP;
9564                 break;
9565             case 'T':  /* (*THEN) */
9566                 /* [19:06] <TimToady> :: is then */
9567                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9568                     op = CUTGROUP;
9569                     RExC_seen |= REG_CUTGROUP_SEEN;
9570                 }
9571                 break;
9572             }
9573             if ( ! op ) {
9574                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9575                 vFAIL2utf8f(
9576                     "Unknown verb pattern '%"UTF8f"'",
9577                     UTF8fARG(UTF, verb_len, start_verb));
9578             }
9579             if ( argok ) {
9580                 if ( start_arg && internal_argval ) {
9581                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9582                         verb_len, start_verb);
9583                 } else if ( argok < 0 && !start_arg ) {
9584                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9585                         verb_len, start_verb);
9586                 } else {
9587                     ret = reganode(pRExC_state, op, internal_argval);
9588                     if ( ! internal_argval && ! SIZE_ONLY ) {
9589                         if (start_arg) {
9590                             SV *sv = newSVpvn( start_arg,
9591                                                RExC_parse - start_arg);
9592                             ARG(ret) = add_data( pRExC_state,
9593                                                  STR_WITH_LEN("S"));
9594                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9595                             ret->flags = 0;
9596                         } else {
9597                             ret->flags = 1;
9598                         }
9599                     }
9600                 }
9601                 if (!internal_argval)
9602                     RExC_seen |= REG_VERBARG_SEEN;
9603             } else if ( start_arg ) {
9604                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9605                         verb_len, start_verb);
9606             } else {
9607                 ret = reg_node(pRExC_state, op);
9608             }
9609             nextchar(pRExC_state);
9610             return ret;
9611         }
9612         else if (*RExC_parse == '?') { /* (?...) */
9613             bool is_logical = 0;
9614             const char * const seqstart = RExC_parse;
9615             if (has_intervening_patws) {
9616                 RExC_parse++;
9617                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9618             }
9619
9620             RExC_parse++;
9621             paren = *RExC_parse++;
9622             ret = NULL;                 /* For look-ahead/behind. */
9623             switch (paren) {
9624
9625             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9626                 paren = *RExC_parse++;
9627                 if ( paren == '<')         /* (?P<...>) named capture */
9628                     goto named_capture;
9629                 else if (paren == '>') {   /* (?P>name) named recursion */
9630                     goto named_recursion;
9631                 }
9632                 else if (paren == '=') {   /* (?P=...)  named backref */
9633                     /* this pretty much dupes the code for \k<NAME> in
9634                      * regatom(), if you change this make sure you change that
9635                      * */
9636                     char* name_start = RExC_parse;
9637                     U32 num = 0;
9638                     SV *sv_dat = reg_scan_name(pRExC_state,
9639                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9640                     if (RExC_parse == name_start || *RExC_parse != ')')
9641                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9642                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9643
9644                     if (!SIZE_ONLY) {
9645                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9646                         RExC_rxi->data->data[num]=(void*)sv_dat;
9647                         SvREFCNT_inc_simple_void(sv_dat);
9648                     }
9649                     RExC_sawback = 1;
9650                     ret = reganode(pRExC_state,
9651                                    ((! FOLD)
9652                                      ? NREF
9653                                      : (ASCII_FOLD_RESTRICTED)
9654                                        ? NREFFA
9655                                        : (AT_LEAST_UNI_SEMANTICS)
9656                                          ? NREFFU
9657                                          : (LOC)
9658                                            ? NREFFL
9659                                            : NREFF),
9660                                     num);
9661                     *flagp |= HASWIDTH;
9662
9663                     Set_Node_Offset(ret, parse_start+1);
9664                     Set_Node_Cur_Length(ret, parse_start);
9665
9666                     nextchar(pRExC_state);
9667                     return ret;
9668                 }
9669                 RExC_parse++;
9670                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9671                 vFAIL3("Sequence (%.*s...) not recognized",
9672                                 RExC_parse-seqstart, seqstart);
9673                 /*NOTREACHED*/
9674             case '<':           /* (?<...) */
9675                 if (*RExC_parse == '!')
9676                     paren = ',';
9677                 else if (*RExC_parse != '=')
9678               named_capture:
9679                 {               /* (?<...>) */
9680                     char *name_start;
9681                     SV *svname;
9682                     paren= '>';
9683             case '\'':          /* (?'...') */
9684                     name_start= RExC_parse;
9685                     svname = reg_scan_name(pRExC_state,
9686                         SIZE_ONLY    /* reverse test from the others */
9687                         ? REG_RSN_RETURN_NAME
9688                         : REG_RSN_RETURN_NULL);
9689                     if (RExC_parse == name_start || *RExC_parse != paren)
9690                         vFAIL2("Sequence (?%c... not terminated",
9691                             paren=='>' ? '<' : paren);
9692                     if (SIZE_ONLY) {
9693                         HE *he_str;
9694                         SV *sv_dat = NULL;
9695                         if (!svname) /* shouldn't happen */
9696                             Perl_croak(aTHX_
9697                                 "panic: reg_scan_name returned NULL");
9698                         if (!RExC_paren_names) {
9699                             RExC_paren_names= newHV();
9700                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9701 #ifdef DEBUGGING
9702                             RExC_paren_name_list= newAV();
9703                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9704 #endif
9705                         }
9706                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9707                         if ( he_str )
9708                             sv_dat = HeVAL(he_str);
9709                         if ( ! sv_dat ) {
9710                             /* croak baby croak */
9711                             Perl_croak(aTHX_
9712                                 "panic: paren_name hash element allocation failed");
9713                         } else if ( SvPOK(sv_dat) ) {
9714                             /* (?|...) can mean we have dupes so scan to check
9715                                its already been stored. Maybe a flag indicating
9716                                we are inside such a construct would be useful,
9717                                but the arrays are likely to be quite small, so
9718                                for now we punt -- dmq */
9719                             IV count = SvIV(sv_dat);
9720                             I32 *pv = (I32*)SvPVX(sv_dat);
9721                             IV i;
9722                             for ( i = 0 ; i < count ; i++ ) {
9723                                 if ( pv[i] == RExC_npar ) {
9724                                     count = 0;
9725                                     break;
9726                                 }
9727                             }
9728                             if ( count ) {
9729                                 pv = (I32*)SvGROW(sv_dat,
9730                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9731                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9732                                 pv[count] = RExC_npar;
9733                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9734                             }
9735                         } else {
9736                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9737                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9738                                                                 sizeof(I32));
9739                             SvIOK_on(sv_dat);
9740                             SvIV_set(sv_dat, 1);
9741                         }
9742 #ifdef DEBUGGING
9743                         /* Yes this does cause a memory leak in debugging Perls
9744                          * */
9745                         if (!av_store(RExC_paren_name_list,
9746                                       RExC_npar, SvREFCNT_inc(svname)))
9747                             SvREFCNT_dec_NN(svname);
9748 #endif
9749
9750                         /*sv_dump(sv_dat);*/
9751                     }
9752                     nextchar(pRExC_state);
9753                     paren = 1;
9754                     goto capturing_parens;
9755                 }
9756                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9757                 RExC_in_lookbehind++;
9758                 RExC_parse++;
9759                 /* FALLTHROUGH */
9760             case '=':           /* (?=...) */
9761                 RExC_seen_zerolen++;
9762                 break;
9763             case '!':           /* (?!...) */
9764                 RExC_seen_zerolen++;
9765                 if (*RExC_parse == ')') {
9766                     ret=reg_node(pRExC_state, OPFAIL);
9767                     nextchar(pRExC_state);
9768                     return ret;
9769                 }
9770                 break;
9771             case '|':           /* (?|...) */
9772                 /* branch reset, behave like a (?:...) except that
9773                    buffers in alternations share the same numbers */
9774                 paren = ':';
9775                 after_freeze = freeze_paren = RExC_npar;
9776                 break;
9777             case ':':           /* (?:...) */
9778             case '>':           /* (?>...) */
9779                 break;
9780             case '$':           /* (?$...) */
9781             case '@':           /* (?@...) */
9782                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9783                 break;
9784             case '0' :           /* (?0) */
9785             case 'R' :           /* (?R) */
9786                 if (*RExC_parse != ')')
9787                     FAIL("Sequence (?R) not terminated");
9788                 ret = reg_node(pRExC_state, GOSTART);
9789                     RExC_seen |= REG_GOSTART_SEEN;
9790                 *flagp |= POSTPONED;
9791                 nextchar(pRExC_state);
9792                 return ret;
9793                 /*notreached*/
9794             /* named and numeric backreferences */
9795             case '&':            /* (?&NAME) */
9796                 parse_start = RExC_parse - 1;
9797               named_recursion:
9798                 {
9799                     SV *sv_dat = reg_scan_name(pRExC_state,
9800                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9801                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9802                 }
9803                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9804                     vFAIL("Sequence (?&... not terminated");
9805                 goto gen_recurse_regop;
9806                 assert(0); /* NOT REACHED */
9807             case '+':
9808                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9809                     RExC_parse++;
9810                     vFAIL("Illegal pattern");
9811                 }
9812                 goto parse_recursion;
9813                 /* NOT REACHED*/
9814             case '-': /* (?-1) */
9815                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9816                     RExC_parse--; /* rewind to let it be handled later */
9817                     goto parse_flags;
9818                 }
9819                 /* FALLTHROUGH */
9820             case '1': case '2': case '3': case '4': /* (?1) */
9821             case '5': case '6': case '7': case '8': case '9':
9822                 RExC_parse--;
9823               parse_recursion:
9824                 num = atoi(RExC_parse);
9825                 parse_start = RExC_parse - 1; /* MJD */
9826                 if (*RExC_parse == '-')
9827                     RExC_parse++;
9828                 while (isDIGIT(*RExC_parse))
9829                         RExC_parse++;
9830                 if (*RExC_parse!=')')
9831                     vFAIL("Expecting close bracket");
9832
9833               gen_recurse_regop:
9834                 if ( paren == '-' ) {
9835                     /*
9836                     Diagram of capture buffer numbering.
9837                     Top line is the normal capture buffer numbers
9838                     Bottom line is the negative indexing as from
9839                     the X (the (?-2))
9840
9841                     +   1 2    3 4 5 X          6 7
9842                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9843                     -   5 4    3 2 1 X          x x
9844
9845                     */
9846                     num = RExC_npar + num;
9847                     if (num < 1)  {
9848                         RExC_parse++;
9849                         vFAIL("Reference to nonexistent group");
9850                     }
9851                 } else if ( paren == '+' ) {
9852                     num = RExC_npar + num - 1;
9853                 }
9854
9855                 ret = reganode(pRExC_state, GOSUB, num);
9856                 if (!SIZE_ONLY) {
9857                     if (num > (I32)RExC_rx->nparens) {
9858                         RExC_parse++;
9859                         vFAIL("Reference to nonexistent group");
9860                     }
9861                     ARG2L_SET( ret, RExC_recurse_count++);
9862                     RExC_emit++;
9863                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9864                         "Recurse #%"UVuf" to %"IVdf"\n",
9865                               (UV)ARG(ret), (IV)ARG2L(ret)));
9866                 } else {
9867                     RExC_size++;
9868                 }
9869                     RExC_seen |= REG_RECURSE_SEEN;
9870                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9871                 Set_Node_Offset(ret, parse_start); /* MJD */
9872
9873                 *flagp |= POSTPONED;
9874                 nextchar(pRExC_state);
9875                 return ret;
9876
9877             assert(0); /* NOT REACHED */
9878
9879             case '?':           /* (??...) */
9880                 is_logical = 1;
9881                 if (*RExC_parse != '{') {
9882                     RExC_parse++;
9883                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9884                     vFAIL2utf8f(
9885                         "Sequence (%"UTF8f"...) not recognized",
9886                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9887                     /*NOTREACHED*/
9888                 }
9889                 *flagp |= POSTPONED;
9890                 paren = *RExC_parse++;
9891                 /* FALLTHROUGH */
9892             case '{':           /* (?{...}) */
9893             {
9894                 U32 n = 0;
9895                 struct reg_code_block *cb;
9896
9897                 RExC_seen_zerolen++;
9898
9899                 if (   !pRExC_state->num_code_blocks
9900                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9901                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9902                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9903                             - RExC_start)
9904                 ) {
9905                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9906                         FAIL("panic: Sequence (?{...}): no code block found\n");
9907                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9908                 }
9909                 /* this is a pre-compiled code block (?{...}) */
9910                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9911                 RExC_parse = RExC_start + cb->end;
9912                 if (!SIZE_ONLY) {
9913                     OP *o = cb->block;
9914                     if (cb->src_regex) {
9915                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9916                         RExC_rxi->data->data[n] =
9917                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9918                         RExC_rxi->data->data[n+1] = (void*)o;
9919                     }
9920                     else {
9921                         n = add_data(pRExC_state,
9922                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9923                         RExC_rxi->data->data[n] = (void*)o;
9924                     }
9925                 }
9926                 pRExC_state->code_index++;
9927                 nextchar(pRExC_state);
9928
9929                 if (is_logical) {
9930                     regnode *eval;
9931                     ret = reg_node(pRExC_state, LOGICAL);
9932                     eval = reganode(pRExC_state, EVAL, n);
9933                     if (!SIZE_ONLY) {
9934                         ret->flags = 2;
9935                         /* for later propagation into (??{}) return value */
9936                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9937                     }
9938                     REGTAIL(pRExC_state, ret, eval);
9939                     /* deal with the length of this later - MJD */
9940                     return ret;
9941                 }
9942                 ret = reganode(pRExC_state, EVAL, n);
9943                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9944                 Set_Node_Offset(ret, parse_start);
9945                 return ret;
9946             }
9947             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9948             {
9949                 int is_define= 0;
9950                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9951                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9952                         || RExC_parse[1] == '<'
9953                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9954                         I32 flag;
9955                         regnode *tail;
9956
9957                         ret = reg_node(pRExC_state, LOGICAL);
9958                         if (!SIZE_ONLY)
9959                             ret->flags = 1;
9960
9961                         tail = reg(pRExC_state, 1, &flag, depth+1);
9962                         if (flag & RESTART_UTF8) {
9963                             *flagp = RESTART_UTF8;
9964                             return NULL;
9965                         }
9966                         REGTAIL(pRExC_state, ret, tail);
9967                         goto insert_if;
9968                     }
9969                 }
9970                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9971                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9972                 {
9973                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9974                     char *name_start= RExC_parse++;
9975                     U32 num = 0;
9976                     SV *sv_dat=reg_scan_name(pRExC_state,
9977                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9978                     if (RExC_parse == name_start || *RExC_parse != ch)
9979                         vFAIL2("Sequence (?(%c... not terminated",
9980                             (ch == '>' ? '<' : ch));
9981                     RExC_parse++;
9982                     if (!SIZE_ONLY) {
9983                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9984                         RExC_rxi->data->data[num]=(void*)sv_dat;
9985                         SvREFCNT_inc_simple_void(sv_dat);
9986                     }
9987                     ret = reganode(pRExC_state,NGROUPP,num);
9988                     goto insert_if_check_paren;
9989                 }
9990                 else if (RExC_parse[0] == 'D' &&
9991                          RExC_parse[1] == 'E' &&
9992                          RExC_parse[2] == 'F' &&
9993                          RExC_parse[3] == 'I' &&
9994                          RExC_parse[4] == 'N' &&
9995                          RExC_parse[5] == 'E')
9996                 {
9997                     ret = reganode(pRExC_state,DEFINEP,0);
9998                     RExC_parse +=6 ;
9999                     is_define = 1;
10000                     goto insert_if_check_paren;
10001                 }
10002                 else if (RExC_parse[0] == 'R') {
10003                     RExC_parse++;
10004                     parno = 0;
10005                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10006                         parno = atoi(RExC_parse++);
10007                         while (isDIGIT(*RExC_parse))
10008                             RExC_parse++;
10009                     } else if (RExC_parse[0] == '&') {
10010                         SV *sv_dat;
10011                         RExC_parse++;
10012                         sv_dat = reg_scan_name(pRExC_state,
10013                             SIZE_ONLY
10014                             ? REG_RSN_RETURN_NULL
10015                             : REG_RSN_RETURN_DATA);
10016                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10017                     }
10018                     ret = reganode(pRExC_state,INSUBP,parno);
10019                     goto insert_if_check_paren;
10020                 }
10021                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10022                     /* (?(1)...) */
10023                     char c;
10024                     char *tmp;
10025                     parno = atoi(RExC_parse++);
10026
10027                     while (isDIGIT(*RExC_parse))
10028                         RExC_parse++;
10029                     ret = reganode(pRExC_state, GROUPP, parno);
10030
10031                  insert_if_check_paren:
10032                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10033                         /* nextchar also skips comments, so undo its work
10034                          * and skip over the the next character.
10035                          */
10036                         RExC_parse = tmp;
10037                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10038                         vFAIL("Switch condition not recognized");
10039                     }
10040                   insert_if:
10041                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10042                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10043                     if (br == NULL) {
10044                         if (flags & RESTART_UTF8) {
10045                             *flagp = RESTART_UTF8;
10046                             return NULL;
10047                         }
10048                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10049                               (UV) flags);
10050                     } else
10051                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10052                                                           LONGJMP, 0));
10053                     c = *nextchar(pRExC_state);
10054                     if (flags&HASWIDTH)
10055                         *flagp |= HASWIDTH;
10056                     if (c == '|') {
10057                         if (is_define)
10058                             vFAIL("(?(DEFINE)....) does not allow branches");
10059
10060                         /* Fake one for optimizer.  */
10061                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10062
10063                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10064                             if (flags & RESTART_UTF8) {
10065                                 *flagp = RESTART_UTF8;
10066                                 return NULL;
10067                             }
10068                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10069                                   (UV) flags);
10070                         }
10071                         REGTAIL(pRExC_state, ret, lastbr);
10072                         if (flags&HASWIDTH)
10073                             *flagp |= HASWIDTH;
10074                         c = *nextchar(pRExC_state);
10075                     }
10076                     else
10077                         lastbr = NULL;
10078                     if (c != ')')
10079                         vFAIL("Switch (?(condition)... contains too many branches");
10080                     ender = reg_node(pRExC_state, TAIL);
10081                     REGTAIL(pRExC_state, br, ender);
10082                     if (lastbr) {
10083                         REGTAIL(pRExC_state, lastbr, ender);
10084                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10085                     }
10086                     else
10087                         REGTAIL(pRExC_state, ret, ender);
10088                     RExC_size++; /* XXX WHY do we need this?!!
10089                                     For large programs it seems to be required
10090                                     but I can't figure out why. -- dmq*/
10091                     return ret;
10092                 }
10093                 else {
10094                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10095                     vFAIL("Unknown switch condition (?(...))");
10096                 }
10097             }
10098             case '[':           /* (?[ ... ]) */
10099                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10100                                          oregcomp_parse);
10101             case 0:
10102                 RExC_parse--; /* for vFAIL to print correctly */
10103                 vFAIL("Sequence (? incomplete");
10104                 break;
10105             default: /* e.g., (?i) */
10106                 --RExC_parse;
10107               parse_flags:
10108                 parse_lparen_question_flags(pRExC_state);
10109                 if (UCHARAT(RExC_parse) != ':') {
10110                     nextchar(pRExC_state);
10111                     *flagp = TRYAGAIN;
10112                     return NULL;
10113                 }
10114                 paren = ':';
10115                 nextchar(pRExC_state);
10116                 ret = NULL;
10117                 goto parse_rest;
10118             } /* end switch */
10119         }
10120         else {                  /* (...) */
10121           capturing_parens:
10122             parno = RExC_npar;
10123             RExC_npar++;
10124
10125             ret = reganode(pRExC_state, OPEN, parno);
10126             if (!SIZE_ONLY ){
10127                 if (!RExC_nestroot)
10128                     RExC_nestroot = parno;
10129                 if (RExC_seen & REG_RECURSE_SEEN
10130                     && !RExC_open_parens[parno-1])
10131                 {
10132                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10133                         "Setting open paren #%"IVdf" to %d\n",
10134                         (IV)parno, REG_NODE_NUM(ret)));
10135                     RExC_open_parens[parno-1]= ret;
10136                 }
10137             }
10138             Set_Node_Length(ret, 1); /* MJD */
10139             Set_Node_Offset(ret, RExC_parse); /* MJD */
10140             is_open = 1;
10141         }
10142     }
10143     else                        /* ! paren */
10144         ret = NULL;
10145
10146    parse_rest:
10147     /* Pick up the branches, linking them together. */
10148     parse_start = RExC_parse;   /* MJD */
10149     br = regbranch(pRExC_state, &flags, 1,depth+1);
10150
10151     /*     branch_len = (paren != 0); */
10152
10153     if (br == NULL) {
10154         if (flags & RESTART_UTF8) {
10155             *flagp = RESTART_UTF8;
10156             return NULL;
10157         }
10158         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10159     }
10160     if (*RExC_parse == '|') {
10161         if (!SIZE_ONLY && RExC_extralen) {
10162             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10163         }
10164         else {                  /* MJD */
10165             reginsert(pRExC_state, BRANCH, br, depth+1);
10166             Set_Node_Length(br, paren != 0);
10167             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10168         }
10169         have_branch = 1;
10170         if (SIZE_ONLY)
10171             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10172     }
10173     else if (paren == ':') {
10174         *flagp |= flags&SIMPLE;
10175     }
10176     if (is_open) {                              /* Starts with OPEN. */
10177         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10178     }
10179     else if (paren != '?')              /* Not Conditional */
10180         ret = br;
10181     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10182     lastbr = br;
10183     while (*RExC_parse == '|') {
10184         if (!SIZE_ONLY && RExC_extralen) {
10185             ender = reganode(pRExC_state, LONGJMP,0);
10186
10187             /* Append to the previous. */
10188             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10189         }
10190         if (SIZE_ONLY)
10191             RExC_extralen += 2;         /* Account for LONGJMP. */
10192         nextchar(pRExC_state);
10193         if (freeze_paren) {
10194             if (RExC_npar > after_freeze)
10195                 after_freeze = RExC_npar;
10196             RExC_npar = freeze_paren;
10197         }
10198         br = regbranch(pRExC_state, &flags, 0, depth+1);
10199
10200         if (br == NULL) {
10201             if (flags & RESTART_UTF8) {
10202                 *flagp = RESTART_UTF8;
10203                 return NULL;
10204             }
10205             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10206         }
10207         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10208         lastbr = br;
10209         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10210     }
10211
10212     if (have_branch || paren != ':') {
10213         /* Make a closing node, and hook it on the end. */
10214         switch (paren) {
10215         case ':':
10216             ender = reg_node(pRExC_state, TAIL);
10217             break;
10218         case 1: case 2:
10219             ender = reganode(pRExC_state, CLOSE, parno);
10220             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10221                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10222                         "Setting close paren #%"IVdf" to %d\n",
10223                         (IV)parno, REG_NODE_NUM(ender)));
10224                 RExC_close_parens[parno-1]= ender;
10225                 if (RExC_nestroot == parno)
10226                     RExC_nestroot = 0;
10227             }
10228             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10229             Set_Node_Length(ender,1); /* MJD */
10230             break;
10231         case '<':
10232         case ',':
10233         case '=':
10234         case '!':
10235             *flagp &= ~HASWIDTH;
10236             /* FALLTHROUGH */
10237         case '>':
10238             ender = reg_node(pRExC_state, SUCCEED);
10239             break;
10240         case 0:
10241             ender = reg_node(pRExC_state, END);
10242             if (!SIZE_ONLY) {
10243                 assert(!RExC_opend); /* there can only be one! */
10244                 RExC_opend = ender;
10245             }
10246             break;
10247         }
10248         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10249             SV * const mysv_val1=sv_newmortal();
10250             SV * const mysv_val2=sv_newmortal();
10251             DEBUG_PARSE_MSG("lsbr");
10252             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10253             regprop(RExC_rx, mysv_val2, ender, NULL);
10254             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10255                           SvPV_nolen_const(mysv_val1),
10256                           (IV)REG_NODE_NUM(lastbr),
10257                           SvPV_nolen_const(mysv_val2),
10258                           (IV)REG_NODE_NUM(ender),
10259                           (IV)(ender - lastbr)
10260             );
10261         });
10262         REGTAIL(pRExC_state, lastbr, ender);
10263
10264         if (have_branch && !SIZE_ONLY) {
10265             char is_nothing= 1;
10266             if (depth==1)
10267                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10268
10269             /* Hook the tails of the branches to the closing node. */
10270             for (br = ret; br; br = regnext(br)) {
10271                 const U8 op = PL_regkind[OP(br)];
10272                 if (op == BRANCH) {
10273                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10274                     if ( OP(NEXTOPER(br)) != NOTHING
10275                          || regnext(NEXTOPER(br)) != ender)
10276                         is_nothing= 0;
10277                 }
10278                 else if (op == BRANCHJ) {
10279                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10280                     /* for now we always disable this optimisation * /
10281                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10282                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10283                     */
10284                         is_nothing= 0;
10285                 }
10286             }
10287             if (is_nothing) {
10288                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10289                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10290                     SV * const mysv_val1=sv_newmortal();
10291                     SV * const mysv_val2=sv_newmortal();
10292                     DEBUG_PARSE_MSG("NADA");
10293                     regprop(RExC_rx, mysv_val1, ret, NULL);
10294                     regprop(RExC_rx, mysv_val2, ender, NULL);
10295                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10296                                   SvPV_nolen_const(mysv_val1),
10297                                   (IV)REG_NODE_NUM(ret),
10298                                   SvPV_nolen_const(mysv_val2),
10299                                   (IV)REG_NODE_NUM(ender),
10300                                   (IV)(ender - ret)
10301                     );
10302                 });
10303                 OP(br)= NOTHING;
10304                 if (OP(ender) == TAIL) {
10305                     NEXT_OFF(br)= 0;
10306                     RExC_emit= br + 1;
10307                 } else {
10308                     regnode *opt;
10309                     for ( opt= br + 1; opt < ender ; opt++ )
10310                         OP(opt)= OPTIMIZED;
10311                     NEXT_OFF(br)= ender - br;
10312                 }
10313             }
10314         }
10315     }
10316
10317     {
10318         const char *p;
10319         static const char parens[] = "=!<,>";
10320
10321         if (paren && (p = strchr(parens, paren))) {
10322             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10323             int flag = (p - parens) > 1;
10324
10325             if (paren == '>')
10326                 node = SUSPEND, flag = 0;
10327             reginsert(pRExC_state, node,ret, depth+1);
10328             Set_Node_Cur_Length(ret, parse_start);
10329             Set_Node_Offset(ret, parse_start + 1);
10330             ret->flags = flag;
10331             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10332         }
10333     }
10334
10335     /* Check for proper termination. */
10336     if (paren) {
10337         /* restore original flags, but keep (?p) */
10338         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10339         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10340             RExC_parse = oregcomp_parse;
10341             vFAIL("Unmatched (");
10342         }
10343     }
10344     else if (!paren && RExC_parse < RExC_end) {
10345         if (*RExC_parse == ')') {
10346             RExC_parse++;
10347             vFAIL("Unmatched )");
10348         }
10349         else
10350             FAIL("Junk on end of regexp");      /* "Can't happen". */
10351         assert(0); /* NOTREACHED */
10352     }
10353
10354     if (RExC_in_lookbehind) {
10355         RExC_in_lookbehind--;
10356     }
10357     if (after_freeze > RExC_npar)
10358         RExC_npar = after_freeze;
10359     return(ret);
10360 }
10361
10362 /*
10363  - regbranch - one alternative of an | operator
10364  *
10365  * Implements the concatenation operator.
10366  *
10367  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10368  * restarted.
10369  */
10370 STATIC regnode *
10371 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10372 {
10373     dVAR;
10374     regnode *ret;
10375     regnode *chain = NULL;
10376     regnode *latest;
10377     I32 flags = 0, c = 0;
10378     GET_RE_DEBUG_FLAGS_DECL;
10379
10380     PERL_ARGS_ASSERT_REGBRANCH;
10381
10382     DEBUG_PARSE("brnc");
10383
10384     if (first)
10385         ret = NULL;
10386     else {
10387         if (!SIZE_ONLY && RExC_extralen)
10388             ret = reganode(pRExC_state, BRANCHJ,0);
10389         else {
10390             ret = reg_node(pRExC_state, BRANCH);
10391             Set_Node_Length(ret, 1);
10392         }
10393     }
10394
10395     if (!first && SIZE_ONLY)
10396         RExC_extralen += 1;                     /* BRANCHJ */
10397
10398     *flagp = WORST;                     /* Tentatively. */
10399
10400     RExC_parse--;
10401     nextchar(pRExC_state);
10402     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10403         flags &= ~TRYAGAIN;
10404         latest = regpiece(pRExC_state, &flags,depth+1);
10405         if (latest == NULL) {
10406             if (flags & TRYAGAIN)
10407                 continue;
10408             if (flags & RESTART_UTF8) {
10409                 *flagp = RESTART_UTF8;
10410                 return NULL;
10411             }
10412             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10413         }
10414         else if (ret == NULL)
10415             ret = latest;
10416         *flagp |= flags&(HASWIDTH|POSTPONED);
10417         if (chain == NULL)      /* First piece. */
10418             *flagp |= flags&SPSTART;
10419         else {
10420             RExC_naughty++;
10421             REGTAIL(pRExC_state, chain, latest);
10422         }
10423         chain = latest;
10424         c++;
10425     }
10426     if (chain == NULL) {        /* Loop ran zero times. */
10427         chain = reg_node(pRExC_state, NOTHING);
10428         if (ret == NULL)
10429             ret = chain;
10430     }
10431     if (c == 1) {
10432         *flagp |= flags&SIMPLE;
10433     }
10434
10435     return ret;
10436 }
10437
10438 /*
10439  - regpiece - something followed by possible [*+?]
10440  *
10441  * Note that the branching code sequences used for ? and the general cases
10442  * of * and + are somewhat optimized:  they use the same NOTHING node as
10443  * both the endmarker for their branch list and the body of the last branch.
10444  * It might seem that this node could be dispensed with entirely, but the
10445  * endmarker role is not redundant.
10446  *
10447  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10448  * TRYAGAIN.
10449  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10450  * restarted.
10451  */
10452 STATIC regnode *
10453 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10454 {
10455     dVAR;
10456     regnode *ret;
10457     char op;
10458     char *next;
10459     I32 flags;
10460     const char * const origparse = RExC_parse;
10461     I32 min;
10462     I32 max = REG_INFTY;
10463 #ifdef RE_TRACK_PATTERN_OFFSETS
10464     char *parse_start;
10465 #endif
10466     const char *maxpos = NULL;
10467
10468     /* Save the original in case we change the emitted regop to a FAIL. */
10469     regnode * const orig_emit = RExC_emit;
10470
10471     GET_RE_DEBUG_FLAGS_DECL;
10472
10473     PERL_ARGS_ASSERT_REGPIECE;
10474
10475     DEBUG_PARSE("piec");
10476
10477     ret = regatom(pRExC_state, &flags,depth+1);
10478     if (ret == NULL) {
10479         if (flags & (TRYAGAIN|RESTART_UTF8))
10480             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10481         else
10482             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10483         return(NULL);
10484     }
10485
10486     op = *RExC_parse;
10487
10488     if (op == '{' && regcurly(RExC_parse)) {
10489         maxpos = NULL;
10490 #ifdef RE_TRACK_PATTERN_OFFSETS
10491         parse_start = RExC_parse; /* MJD */
10492 #endif
10493         next = RExC_parse + 1;
10494         while (isDIGIT(*next) || *next == ',') {
10495             if (*next == ',') {
10496                 if (maxpos)
10497                     break;
10498                 else
10499                     maxpos = next;
10500             }
10501             next++;
10502         }
10503         if (*next == '}') {             /* got one */
10504             if (!maxpos)
10505                 maxpos = next;
10506             RExC_parse++;
10507             min = atoi(RExC_parse);
10508             if (*maxpos == ',')
10509                 maxpos++;
10510             else
10511                 maxpos = RExC_parse;
10512             max = atoi(maxpos);
10513             if (!max && *maxpos != '0')
10514                 max = REG_INFTY;                /* meaning "infinity" */
10515             else if (max >= REG_INFTY)
10516                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10517             RExC_parse = next;
10518             nextchar(pRExC_state);
10519             if (max < min) {    /* If can't match, warn and optimize to fail
10520                                    unconditionally */
10521                 if (SIZE_ONLY) {
10522                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10523
10524                     /* We can't back off the size because we have to reserve
10525                      * enough space for all the things we are about to throw
10526                      * away, but we can shrink it by the ammount we are about
10527                      * to re-use here */
10528                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10529                 }
10530                 else {
10531                     RExC_emit = orig_emit;
10532                 }
10533                 ret = reg_node(pRExC_state, OPFAIL);
10534                 return ret;
10535             }
10536             else if (min == max
10537                      && RExC_parse < RExC_end
10538                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10539             {
10540                 if (SIZE_ONLY) {
10541                     ckWARN2reg(RExC_parse + 1,
10542                                "Useless use of greediness modifier '%c'",
10543                                *RExC_parse);
10544                 }
10545                 /* Absorb the modifier, so later code doesn't see nor use
10546                     * it */
10547                 nextchar(pRExC_state);
10548             }
10549
10550         do_curly:
10551             if ((flags&SIMPLE)) {
10552                 RExC_naughty += 2 + RExC_naughty / 2;
10553                 reginsert(pRExC_state, CURLY, ret, depth+1);
10554                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10555                 Set_Node_Cur_Length(ret, parse_start);
10556             }
10557             else {
10558                 regnode * const w = reg_node(pRExC_state, WHILEM);
10559
10560                 w->flags = 0;
10561                 REGTAIL(pRExC_state, ret, w);
10562                 if (!SIZE_ONLY && RExC_extralen) {
10563                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10564                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10565                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10566                 }
10567                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10568                                 /* MJD hk */
10569                 Set_Node_Offset(ret, parse_start+1);
10570                 Set_Node_Length(ret,
10571                                 op == '{' ? (RExC_parse - parse_start) : 1);
10572
10573                 if (!SIZE_ONLY && RExC_extralen)
10574                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10575                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10576                 if (SIZE_ONLY)
10577                     RExC_whilem_seen++, RExC_extralen += 3;
10578                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10579             }
10580             ret->flags = 0;
10581
10582             if (min > 0)
10583                 *flagp = WORST;
10584             if (max > 0)
10585                 *flagp |= HASWIDTH;
10586             if (!SIZE_ONLY) {
10587                 ARG1_SET(ret, (U16)min);
10588                 ARG2_SET(ret, (U16)max);
10589             }
10590             if (max == REG_INFTY)
10591                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10592
10593             goto nest_check;
10594         }
10595     }
10596
10597     if (!ISMULT1(op)) {
10598         *flagp = flags;
10599         return(ret);
10600     }
10601
10602 #if 0                           /* Now runtime fix should be reliable. */
10603
10604     /* if this is reinstated, don't forget to put this back into perldiag:
10605
10606             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10607
10608            (F) The part of the regexp subject to either the * or + quantifier
10609            could match an empty string. The {#} shows in the regular
10610            expression about where the problem was discovered.
10611
10612     */
10613
10614     if (!(flags&HASWIDTH) && op != '?')
10615       vFAIL("Regexp *+ operand could be empty");
10616 #endif
10617
10618 #ifdef RE_TRACK_PATTERN_OFFSETS
10619     parse_start = RExC_parse;
10620 #endif
10621     nextchar(pRExC_state);
10622
10623     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10624
10625     if (op == '*' && (flags&SIMPLE)) {
10626         reginsert(pRExC_state, STAR, ret, depth+1);
10627         ret->flags = 0;
10628         RExC_naughty += 4;
10629         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10630     }
10631     else if (op == '*') {
10632         min = 0;
10633         goto do_curly;
10634     }
10635     else if (op == '+' && (flags&SIMPLE)) {
10636         reginsert(pRExC_state, PLUS, ret, depth+1);
10637         ret->flags = 0;
10638         RExC_naughty += 3;
10639         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10640     }
10641     else if (op == '+') {
10642         min = 1;
10643         goto do_curly;
10644     }
10645     else if (op == '?') {
10646         min = 0; max = 1;
10647         goto do_curly;
10648     }
10649   nest_check:
10650     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10651         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10652         ckWARN2reg(RExC_parse,
10653                    "%"UTF8f" matches null string many times",
10654                    UTF8fARG(UTF, (RExC_parse >= origparse
10655                                  ? RExC_parse - origparse
10656                                  : 0),
10657                    origparse));
10658         (void)ReREFCNT_inc(RExC_rx_sv);
10659     }
10660
10661     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10662         nextchar(pRExC_state);
10663         reginsert(pRExC_state, MINMOD, ret, depth+1);
10664         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10665     }
10666     else
10667     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10668         regnode *ender;
10669         nextchar(pRExC_state);
10670         ender = reg_node(pRExC_state, SUCCEED);
10671         REGTAIL(pRExC_state, ret, ender);
10672         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10673         ret->flags = 0;
10674         ender = reg_node(pRExC_state, TAIL);
10675         REGTAIL(pRExC_state, ret, ender);
10676     }
10677
10678     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10679         RExC_parse++;
10680         vFAIL("Nested quantifiers");
10681     }
10682
10683     return(ret);
10684 }
10685
10686 STATIC bool
10687 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10688                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10689                       const bool strict   /* Apply stricter parsing rules? */
10690     )
10691 {
10692
10693  /* This is expected to be called by a parser routine that has recognized '\N'
10694    and needs to handle the rest. RExC_parse is expected to point at the first
10695    char following the N at the time of the call.  On successful return,
10696    RExC_parse has been updated to point to just after the sequence identified
10697    by this routine, and <*flagp> has been updated.
10698
10699    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10700    character class.
10701
10702    \N may begin either a named sequence, or if outside a character class, mean
10703    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10704    attempted to decide which, and in the case of a named sequence, converted it
10705    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10706    where c1... are the characters in the sequence.  For single-quoted regexes,
10707    the tokenizer passes the \N sequence through unchanged; this code will not
10708    attempt to determine this nor expand those, instead raising a syntax error.
10709    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10710    or there is no '}', it signals that this \N occurrence means to match a
10711    non-newline.
10712
10713    Only the \N{U+...} form should occur in a character class, for the same
10714    reason that '.' inside a character class means to just match a period: it
10715    just doesn't make sense.
10716
10717    The function raises an error (via vFAIL), and doesn't return for various
10718    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10719    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10720    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10721    only possible if node_p is non-NULL.
10722
10723
10724    If <valuep> is non-null, it means the caller can accept an input sequence
10725    consisting of a just a single code point; <*valuep> is set to that value
10726    if the input is such.
10727
10728    If <node_p> is non-null it signifies that the caller can accept any other
10729    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10730    is set as follows:
10731     1) \N means not-a-NL: points to a newly created REG_ANY node;
10732     2) \N{}:              points to a new NOTHING node;
10733     3) otherwise:         points to a new EXACT node containing the resolved
10734                           string.
10735    Note that FALSE is returned for single code point sequences if <valuep> is
10736    null.
10737  */
10738
10739     char * endbrace;    /* '}' following the name */
10740     char* p;
10741     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10742                            stream */
10743     bool has_multiple_chars; /* true if the input stream contains a sequence of
10744                                 more than one character */
10745
10746     GET_RE_DEBUG_FLAGS_DECL;
10747
10748     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10749
10750     GET_RE_DEBUG_FLAGS;
10751
10752     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10753
10754     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10755      * modifier.  The other meaning does not, so use a temporary until we find
10756      * out which we are being called with */
10757     p = (RExC_flags & RXf_PMf_EXTENDED)
10758         ? regpatws(pRExC_state, RExC_parse,
10759                                 TRUE) /* means recognize comments */
10760         : RExC_parse;
10761
10762     /* Disambiguate between \N meaning a named character versus \N meaning
10763      * [^\n].  The former is assumed when it can't be the latter. */
10764     if (*p != '{' || regcurly(p)) {
10765         RExC_parse = p;
10766         if (! node_p) {
10767             /* no bare \N allowed in a charclass */
10768             if (in_char_class) {
10769                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10770             }
10771             return FALSE;
10772         }
10773         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10774                            current char */
10775         nextchar(pRExC_state);
10776         *node_p = reg_node(pRExC_state, REG_ANY);
10777         *flagp |= HASWIDTH|SIMPLE;
10778         RExC_naughty++;
10779         Set_Node_Length(*node_p, 1); /* MJD */
10780         return TRUE;
10781     }
10782
10783     /* Here, we have decided it should be a named character or sequence */
10784
10785     /* The test above made sure that the next real character is a '{', but
10786      * under the /x modifier, it could be separated by space (or a comment and
10787      * \n) and this is not allowed (for consistency with \x{...} and the
10788      * tokenizer handling of \N{NAME}). */
10789     if (*RExC_parse != '{') {
10790         vFAIL("Missing braces on \\N{}");
10791     }
10792
10793     RExC_parse++;       /* Skip past the '{' */
10794
10795     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10796         || ! (endbrace == RExC_parse            /* nothing between the {} */
10797               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10798                                                  */
10799                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10800                                                      */
10801     {
10802         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10803         vFAIL("\\N{NAME} must be resolved by the lexer");
10804     }
10805
10806     if (endbrace == RExC_parse) {   /* empty: \N{} */
10807         bool ret = TRUE;
10808         if (node_p) {
10809             *node_p = reg_node(pRExC_state,NOTHING);
10810         }
10811         else if (in_char_class) {
10812             if (SIZE_ONLY && in_char_class) {
10813                 if (strict) {
10814                     RExC_parse++;   /* Position after the "}" */
10815                     vFAIL("Zero length \\N{}");
10816                 }
10817                 else {
10818                     ckWARNreg(RExC_parse,
10819                               "Ignoring zero length \\N{} in character class");
10820                 }
10821             }
10822             ret = FALSE;
10823         }
10824         else {
10825             return FALSE;
10826         }
10827         nextchar(pRExC_state);
10828         return ret;
10829     }
10830
10831     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10832     RExC_parse += 2;    /* Skip past the 'U+' */
10833
10834     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10835
10836     /* Code points are separated by dots.  If none, there is only one code
10837      * point, and is terminated by the brace */
10838     has_multiple_chars = (endchar < endbrace);
10839
10840     if (valuep && (! has_multiple_chars || in_char_class)) {
10841         /* We only pay attention to the first char of
10842         multichar strings being returned in char classes. I kinda wonder
10843         if this makes sense as it does change the behaviour
10844         from earlier versions, OTOH that behaviour was broken
10845         as well. XXX Solution is to recharacterize as
10846         [rest-of-class]|multi1|multi2... */
10847
10848         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10849         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10850             | PERL_SCAN_DISALLOW_PREFIX
10851             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10852
10853         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10854
10855         /* The tokenizer should have guaranteed validity, but it's possible to
10856          * bypass it by using single quoting, so check */
10857         if (length_of_hex == 0
10858             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10859         {
10860             RExC_parse += length_of_hex;        /* Includes all the valid */
10861             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10862                             ? UTF8SKIP(RExC_parse)
10863                             : 1;
10864             /* Guard against malformed utf8 */
10865             if (RExC_parse >= endchar) {
10866                 RExC_parse = endchar;
10867             }
10868             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10869         }
10870
10871         if (in_char_class && has_multiple_chars) {
10872             if (strict) {
10873                 RExC_parse = endbrace;
10874                 vFAIL("\\N{} in character class restricted to one character");
10875             }
10876             else {
10877                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10878             }
10879         }
10880
10881         RExC_parse = endbrace + 1;
10882     }
10883     else if (! node_p || ! has_multiple_chars) {
10884
10885         /* Here, the input is legal, but not according to the caller's
10886          * options.  We fail without advancing the parse, so that the
10887          * caller can try again */
10888         RExC_parse = p;
10889         return FALSE;
10890     }
10891     else {
10892
10893         /* What is done here is to convert this to a sub-pattern of the form
10894          * (?:\x{char1}\x{char2}...)
10895          * and then call reg recursively.  That way, it retains its atomicness,
10896          * while not having to worry about special handling that some code
10897          * points may have.  toke.c has converted the original Unicode values
10898          * to native, so that we can just pass on the hex values unchanged.  We
10899          * do have to set a flag to keep recoding from happening in the
10900          * recursion */
10901
10902         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10903         STRLEN len;
10904         char *orig_end = RExC_end;
10905         I32 flags;
10906
10907         while (RExC_parse < endbrace) {
10908
10909             /* Convert to notation the rest of the code understands */
10910             sv_catpv(substitute_parse, "\\x{");
10911             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10912             sv_catpv(substitute_parse, "}");
10913
10914             /* Point to the beginning of the next character in the sequence. */
10915             RExC_parse = endchar + 1;
10916             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10917         }
10918         sv_catpv(substitute_parse, ")");
10919
10920         RExC_parse = SvPV(substitute_parse, len);
10921
10922         /* Don't allow empty number */
10923         if (len < 8) {
10924             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10925         }
10926         RExC_end = RExC_parse + len;
10927
10928         /* The values are Unicode, and therefore not subject to recoding */
10929         RExC_override_recoding = 1;
10930
10931         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10932             if (flags & RESTART_UTF8) {
10933                 *flagp = RESTART_UTF8;
10934                 return FALSE;
10935             }
10936             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10937                   (UV) flags);
10938         }
10939         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10940
10941         RExC_parse = endbrace;
10942         RExC_end = orig_end;
10943         RExC_override_recoding = 0;
10944
10945         nextchar(pRExC_state);
10946     }
10947
10948     return TRUE;
10949 }
10950
10951
10952 /*
10953  * reg_recode
10954  *
10955  * It returns the code point in utf8 for the value in *encp.
10956  *    value: a code value in the source encoding
10957  *    encp:  a pointer to an Encode object
10958  *
10959  * If the result from Encode is not a single character,
10960  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10961  */
10962 STATIC UV
10963 S_reg_recode(pTHX_ const char value, SV **encp)
10964 {
10965     STRLEN numlen = 1;
10966     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10967     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10968     const STRLEN newlen = SvCUR(sv);
10969     UV uv = UNICODE_REPLACEMENT;
10970
10971     PERL_ARGS_ASSERT_REG_RECODE;
10972
10973     if (newlen)
10974         uv = SvUTF8(sv)
10975              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10976              : *(U8*)s;
10977
10978     if (!newlen || numlen != newlen) {
10979         uv = UNICODE_REPLACEMENT;
10980         *encp = NULL;
10981     }
10982     return uv;
10983 }
10984
10985 PERL_STATIC_INLINE U8
10986 S_compute_EXACTish(RExC_state_t *pRExC_state)
10987 {
10988     U8 op;
10989
10990     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10991
10992     if (! FOLD) {
10993         return EXACT;
10994     }
10995
10996     op = get_regex_charset(RExC_flags);
10997     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10998         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10999                  been, so there is no hole */
11000     }
11001
11002     return op + EXACTF;
11003 }
11004
11005 PERL_STATIC_INLINE void
11006 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11007                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11008                          bool downgradable)
11009 {
11010     /* This knows the details about sizing an EXACTish node, setting flags for
11011      * it (by setting <*flagp>, and potentially populating it with a single
11012      * character.
11013      *
11014      * If <len> (the length in bytes) is non-zero, this function assumes that
11015      * the node has already been populated, and just does the sizing.  In this
11016      * case <code_point> should be the final code point that has already been
11017      * placed into the node.  This value will be ignored except that under some
11018      * circumstances <*flagp> is set based on it.
11019      *
11020      * If <len> is zero, the function assumes that the node is to contain only
11021      * the single character given by <code_point> and calculates what <len>
11022      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11023      * additionally will populate the node's STRING with <code_point> or its
11024      * fold if folding.
11025      *
11026      * In both cases <*flagp> is appropriately set
11027      *
11028      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11029      * 255, must be folded (the former only when the rules indicate it can
11030      * match 'ss')
11031      *
11032      * When it does the populating, it looks at the flag 'downgradable'.  If
11033      * true with a node that folds, it checks if the single code point
11034      * participates in a fold, and if not downgrades the node to an EXACT.
11035      * This helps the optimizer */
11036
11037     bool len_passed_in = cBOOL(len != 0);
11038     U8 character[UTF8_MAXBYTES_CASE+1];
11039
11040     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11041
11042     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11043      * sizing difference, and is extra work that is thrown away */
11044     if (downgradable && ! PASS2) {
11045         downgradable = FALSE;
11046     }
11047
11048     if (! len_passed_in) {
11049         if (UTF) {
11050             if (UNI_IS_INVARIANT(code_point)) {
11051                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11052                     *character = (U8) code_point;
11053                 }
11054                 else { /* Here is /i and not /l (toFOLD() is defined on just
11055                           ASCII, which isn't the same thing as INVARIANT on
11056                           EBCDIC, but it works there, as the extra invariants
11057                           fold to themselves) */
11058                     *character = toFOLD((U8) code_point);
11059                     if (downgradable
11060                         && *character == code_point
11061                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11062                     {
11063                         OP(node) = EXACT;
11064                     }
11065                 }
11066                 len = 1;
11067             }
11068             else if (FOLD && (! LOC
11069                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11070             {   /* Folding, and ok to do so now */
11071                 UV folded = _to_uni_fold_flags(
11072                                    code_point,
11073                                    character,
11074                                    &len,
11075                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11076                                                       ? FOLD_FLAGS_NOMIX_ASCII
11077                                                       : 0));
11078                 if (downgradable
11079                     && folded == code_point
11080                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11081                 {
11082                     OP(node) = EXACT;
11083                 }
11084             }
11085             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11086
11087                 /* Not folding this cp, and can output it directly */
11088                 *character = UTF8_TWO_BYTE_HI(code_point);
11089                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11090                 len = 2;
11091             }
11092             else {
11093                 uvchr_to_utf8( character, code_point);
11094                 len = UTF8SKIP(character);
11095             }
11096         } /* Else pattern isn't UTF8.  */
11097         else if (! FOLD) {
11098             *character = (U8) code_point;
11099             len = 1;
11100         } /* Else is folded non-UTF8 */
11101         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11102
11103             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11104              * comments at join_exact()); */
11105             *character = (U8) code_point;
11106             len = 1;
11107
11108             /* Can turn into an EXACT node if we know the fold at compile time,
11109              * and it folds to itself and doesn't particpate in other folds */
11110             if (downgradable
11111                 && ! LOC
11112                 && PL_fold_latin1[code_point] == code_point
11113                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11114                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11115             {
11116                 OP(node) = EXACT;
11117             }
11118         } /* else is Sharp s.  May need to fold it */
11119         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11120             *character = 's';
11121             *(character + 1) = 's';
11122             len = 2;
11123         }
11124         else {
11125             *character = LATIN_SMALL_LETTER_SHARP_S;
11126             len = 1;
11127         }
11128     }
11129
11130     if (SIZE_ONLY) {
11131         RExC_size += STR_SZ(len);
11132     }
11133     else {
11134         RExC_emit += STR_SZ(len);
11135         STR_LEN(node) = len;
11136         if (! len_passed_in) {
11137             Copy((char *) character, STRING(node), len, char);
11138         }
11139     }
11140
11141     *flagp |= HASWIDTH;
11142
11143     /* A single character node is SIMPLE, except for the special-cased SHARP S
11144      * under /di. */
11145     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11146         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11147             || ! FOLD || ! DEPENDS_SEMANTICS))
11148     {
11149         *flagp |= SIMPLE;
11150     }
11151
11152     /* The OP may not be well defined in PASS1 */
11153     if (PASS2 && OP(node) == EXACTFL) {
11154         RExC_contains_locale = 1;
11155     }
11156 }
11157
11158
11159 /* return atoi(p), unless it's too big to sensibly be a backref,
11160  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11161
11162 static I32
11163 S_backref_value(char *p)
11164 {
11165     char *q = p;
11166
11167     for (;isDIGIT(*q); q++) {} /* calculate length of num */
11168     if (q - p == 0 || q - p > 9)
11169         return I32_MAX;
11170     return atoi(p);
11171 }
11172
11173
11174 /*
11175  - regatom - the lowest level
11176
11177    Try to identify anything special at the start of the pattern. If there
11178    is, then handle it as required. This may involve generating a single regop,
11179    such as for an assertion; or it may involve recursing, such as to
11180    handle a () structure.
11181
11182    If the string doesn't start with something special then we gobble up
11183    as much literal text as we can.
11184
11185    Once we have been able to handle whatever type of thing started the
11186    sequence, we return.
11187
11188    Note: we have to be careful with escapes, as they can be both literal
11189    and special, and in the case of \10 and friends, context determines which.
11190
11191    A summary of the code structure is:
11192
11193    switch (first_byte) {
11194         cases for each special:
11195             handle this special;
11196             break;
11197         case '\\':
11198             switch (2nd byte) {
11199                 cases for each unambiguous special:
11200                     handle this special;
11201                     break;
11202                 cases for each ambigous special/literal:
11203                     disambiguate;
11204                     if (special)  handle here
11205                     else goto defchar;
11206                 default: // unambiguously literal:
11207                     goto defchar;
11208             }
11209         default:  // is a literal char
11210             // FALL THROUGH
11211         defchar:
11212             create EXACTish node for literal;
11213             while (more input and node isn't full) {
11214                 switch (input_byte) {
11215                    cases for each special;
11216                        make sure parse pointer is set so that the next call to
11217                            regatom will see this special first
11218                        goto loopdone; // EXACTish node terminated by prev. char
11219                    default:
11220                        append char to EXACTISH node;
11221                 }
11222                 get next input byte;
11223             }
11224         loopdone:
11225    }
11226    return the generated node;
11227
11228    Specifically there are two separate switches for handling
11229    escape sequences, with the one for handling literal escapes requiring
11230    a dummy entry for all of the special escapes that are actually handled
11231    by the other.
11232
11233    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11234    TRYAGAIN.
11235    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11236    restarted.
11237    Otherwise does not return NULL.
11238 */
11239
11240 STATIC regnode *
11241 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11242 {
11243     dVAR;
11244     regnode *ret = NULL;
11245     I32 flags = 0;
11246     char *parse_start = RExC_parse;
11247     U8 op;
11248     int invert = 0;
11249     U8 arg;
11250
11251     GET_RE_DEBUG_FLAGS_DECL;
11252
11253     *flagp = WORST;             /* Tentatively. */
11254
11255     DEBUG_PARSE("atom");
11256
11257     PERL_ARGS_ASSERT_REGATOM;
11258
11259 tryagain:
11260     switch ((U8)*RExC_parse) {
11261     case '^':
11262         RExC_seen_zerolen++;
11263         nextchar(pRExC_state);
11264         if (RExC_flags & RXf_PMf_MULTILINE)
11265             ret = reg_node(pRExC_state, MBOL);
11266         else if (RExC_flags & RXf_PMf_SINGLELINE)
11267             ret = reg_node(pRExC_state, SBOL);
11268         else
11269             ret = reg_node(pRExC_state, BOL);
11270         Set_Node_Length(ret, 1); /* MJD */
11271         break;
11272     case '$':
11273         nextchar(pRExC_state);
11274         if (*RExC_parse)
11275             RExC_seen_zerolen++;
11276         if (RExC_flags & RXf_PMf_MULTILINE)
11277             ret = reg_node(pRExC_state, MEOL);
11278         else if (RExC_flags & RXf_PMf_SINGLELINE)
11279             ret = reg_node(pRExC_state, SEOL);
11280         else
11281             ret = reg_node(pRExC_state, EOL);
11282         Set_Node_Length(ret, 1); /* MJD */
11283         break;
11284     case '.':
11285         nextchar(pRExC_state);
11286         if (RExC_flags & RXf_PMf_SINGLELINE)
11287             ret = reg_node(pRExC_state, SANY);
11288         else
11289             ret = reg_node(pRExC_state, REG_ANY);
11290         *flagp |= HASWIDTH|SIMPLE;
11291         RExC_naughty++;
11292         Set_Node_Length(ret, 1); /* MJD */
11293         break;
11294     case '[':
11295     {
11296         char * const oregcomp_parse = ++RExC_parse;
11297         ret = regclass(pRExC_state, flagp,depth+1,
11298                        FALSE, /* means parse the whole char class */
11299                        TRUE, /* allow multi-char folds */
11300                        FALSE, /* don't silence non-portable warnings. */
11301                        NULL);
11302         if (*RExC_parse != ']') {
11303             RExC_parse = oregcomp_parse;
11304             vFAIL("Unmatched [");
11305         }
11306         if (ret == NULL) {
11307             if (*flagp & RESTART_UTF8)
11308                 return NULL;
11309             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11310                   (UV) *flagp);
11311         }
11312         nextchar(pRExC_state);
11313         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11314         break;
11315     }
11316     case '(':
11317         nextchar(pRExC_state);
11318         ret = reg(pRExC_state, 2, &flags,depth+1);
11319         if (ret == NULL) {
11320                 if (flags & TRYAGAIN) {
11321                     if (RExC_parse == RExC_end) {
11322                          /* Make parent create an empty node if needed. */
11323                         *flagp |= TRYAGAIN;
11324                         return(NULL);
11325                     }
11326                     goto tryagain;
11327                 }
11328                 if (flags & RESTART_UTF8) {
11329                     *flagp = RESTART_UTF8;
11330                     return NULL;
11331                 }
11332                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11333                                                                  (UV) flags);
11334         }
11335         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11336         break;
11337     case '|':
11338     case ')':
11339         if (flags & TRYAGAIN) {
11340             *flagp |= TRYAGAIN;
11341             return NULL;
11342         }
11343         vFAIL("Internal urp");
11344                                 /* Supposed to be caught earlier. */
11345         break;
11346     case '?':
11347     case '+':
11348     case '*':
11349         RExC_parse++;
11350         vFAIL("Quantifier follows nothing");
11351         break;
11352     case '\\':
11353         /* Special Escapes
11354
11355            This switch handles escape sequences that resolve to some kind
11356            of special regop and not to literal text. Escape sequnces that
11357            resolve to literal text are handled below in the switch marked
11358            "Literal Escapes".
11359
11360            Every entry in this switch *must* have a corresponding entry
11361            in the literal escape switch. However, the opposite is not
11362            required, as the default for this switch is to jump to the
11363            literal text handling code.
11364         */
11365         switch ((U8)*++RExC_parse) {
11366         /* Special Escapes */
11367         case 'A':
11368             RExC_seen_zerolen++;
11369             ret = reg_node(pRExC_state, SBOL);
11370             *flagp |= SIMPLE;
11371             goto finish_meta_pat;
11372         case 'G':
11373             ret = reg_node(pRExC_state, GPOS);
11374             RExC_seen |= REG_GPOS_SEEN;
11375             *flagp |= SIMPLE;
11376             goto finish_meta_pat;
11377         case 'K':
11378             RExC_seen_zerolen++;
11379             ret = reg_node(pRExC_state, KEEPS);
11380             *flagp |= SIMPLE;
11381             /* XXX:dmq : disabling in-place substitution seems to
11382              * be necessary here to avoid cases of memory corruption, as
11383              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11384              */
11385             RExC_seen |= REG_LOOKBEHIND_SEEN;
11386             goto finish_meta_pat;
11387         case 'Z':
11388             ret = reg_node(pRExC_state, SEOL);
11389             *flagp |= SIMPLE;
11390             RExC_seen_zerolen++;                /* Do not optimize RE away */
11391             goto finish_meta_pat;
11392         case 'z':
11393             ret = reg_node(pRExC_state, EOS);
11394             *flagp |= SIMPLE;
11395             RExC_seen_zerolen++;                /* Do not optimize RE away */
11396             goto finish_meta_pat;
11397         case 'C':
11398             ret = reg_node(pRExC_state, CANY);
11399             RExC_seen |= REG_CANY_SEEN;
11400             *flagp |= HASWIDTH|SIMPLE;
11401             goto finish_meta_pat;
11402         case 'X':
11403             ret = reg_node(pRExC_state, CLUMP);
11404             *flagp |= HASWIDTH;
11405             goto finish_meta_pat;
11406
11407         case 'W':
11408             invert = 1;
11409             /* FALLTHROUGH */
11410         case 'w':
11411             arg = ANYOF_WORDCHAR;
11412             goto join_posix;
11413
11414         case 'b':
11415             RExC_seen_zerolen++;
11416             RExC_seen |= REG_LOOKBEHIND_SEEN;
11417             op = BOUND + get_regex_charset(RExC_flags);
11418             if (op > BOUNDA) {  /* /aa is same as /a */
11419                 op = BOUNDA;
11420             }
11421             else if (op == BOUNDL) {
11422                 RExC_contains_locale = 1;
11423             }
11424             ret = reg_node(pRExC_state, op);
11425             FLAGS(ret) = get_regex_charset(RExC_flags);
11426             *flagp |= SIMPLE;
11427             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11428                 /* diag_listed_as: Use "%s" instead of "%s" */
11429                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11430             }
11431             goto finish_meta_pat;
11432         case 'B':
11433             RExC_seen_zerolen++;
11434             RExC_seen |= REG_LOOKBEHIND_SEEN;
11435             op = NBOUND + get_regex_charset(RExC_flags);
11436             if (op > NBOUNDA) { /* /aa is same as /a */
11437                 op = NBOUNDA;
11438             }
11439             else if (op == NBOUNDL) {
11440                 RExC_contains_locale = 1;
11441             }
11442             ret = reg_node(pRExC_state, op);
11443             FLAGS(ret) = get_regex_charset(RExC_flags);
11444             *flagp |= SIMPLE;
11445             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11446                 /* diag_listed_as: Use "%s" instead of "%s" */
11447                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11448             }
11449             goto finish_meta_pat;
11450
11451         case 'D':
11452             invert = 1;
11453             /* FALLTHROUGH */
11454         case 'd':
11455             arg = ANYOF_DIGIT;
11456             goto join_posix;
11457
11458         case 'R':
11459             ret = reg_node(pRExC_state, LNBREAK);
11460             *flagp |= HASWIDTH|SIMPLE;
11461             goto finish_meta_pat;
11462
11463         case 'H':
11464             invert = 1;
11465             /* FALLTHROUGH */
11466         case 'h':
11467             arg = ANYOF_BLANK;
11468             op = POSIXU;
11469             goto join_posix_op_known;
11470
11471         case 'V':
11472             invert = 1;
11473             /* FALLTHROUGH */
11474         case 'v':
11475             arg = ANYOF_VERTWS;
11476             op = POSIXU;
11477             goto join_posix_op_known;
11478
11479         case 'S':
11480             invert = 1;
11481             /* FALLTHROUGH */
11482         case 's':
11483             arg = ANYOF_SPACE;
11484
11485         join_posix:
11486
11487             op = POSIXD + get_regex_charset(RExC_flags);
11488             if (op > POSIXA) {  /* /aa is same as /a */
11489                 op = POSIXA;
11490             }
11491             else if (op == POSIXL) {
11492                 RExC_contains_locale = 1;
11493             }
11494
11495         join_posix_op_known:
11496
11497             if (invert) {
11498                 op += NPOSIXD - POSIXD;
11499             }
11500
11501             ret = reg_node(pRExC_state, op);
11502             if (! SIZE_ONLY) {
11503                 FLAGS(ret) = namedclass_to_classnum(arg);
11504             }
11505
11506             *flagp |= HASWIDTH|SIMPLE;
11507             /* FALLTHROUGH */
11508
11509          finish_meta_pat:
11510             nextchar(pRExC_state);
11511             Set_Node_Length(ret, 2); /* MJD */
11512             break;
11513         case 'p':
11514         case 'P':
11515             {
11516 #ifdef DEBUGGING
11517                 char* parse_start = RExC_parse - 2;
11518 #endif
11519
11520                 RExC_parse--;
11521
11522                 ret = regclass(pRExC_state, flagp,depth+1,
11523                                TRUE, /* means just parse this element */
11524                                FALSE, /* don't allow multi-char folds */
11525                                FALSE, /* don't silence non-portable warnings.
11526                                          It would be a bug if these returned
11527                                          non-portables */
11528                                NULL);
11529                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11530                    are allowed.  */
11531                 if (!ret)
11532                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11533                           (UV) *flagp);
11534
11535                 RExC_parse--;
11536
11537                 Set_Node_Offset(ret, parse_start + 2);
11538                 Set_Node_Cur_Length(ret, parse_start);
11539                 nextchar(pRExC_state);
11540             }
11541             break;
11542         case 'N':
11543             /* Handle \N and \N{NAME} with multiple code points here and not
11544              * below because it can be multicharacter. join_exact() will join
11545              * them up later on.  Also this makes sure that things like
11546              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11547              * The options to the grok function call causes it to fail if the
11548              * sequence is just a single code point.  We then go treat it as
11549              * just another character in the current EXACT node, and hence it
11550              * gets uniform treatment with all the other characters.  The
11551              * special treatment for quantifiers is not needed for such single
11552              * character sequences */
11553             ++RExC_parse;
11554             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11555                                 FALSE /* not strict */ )) {
11556                 if (*flagp & RESTART_UTF8)
11557                     return NULL;
11558                 RExC_parse--;
11559                 goto defchar;
11560             }
11561             break;
11562         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11563         parse_named_seq:
11564         {
11565             char ch= RExC_parse[1];
11566             if (ch != '<' && ch != '\'' && ch != '{') {
11567                 RExC_parse++;
11568                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11569                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11570             } else {
11571                 /* this pretty much dupes the code for (?P=...) in reg(), if
11572                    you change this make sure you change that */
11573                 char* name_start = (RExC_parse += 2);
11574                 U32 num = 0;
11575                 SV *sv_dat = reg_scan_name(pRExC_state,
11576                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11577                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11578                 if (RExC_parse == name_start || *RExC_parse != ch)
11579                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11580                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11581
11582                 if (!SIZE_ONLY) {
11583                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11584                     RExC_rxi->data->data[num]=(void*)sv_dat;
11585                     SvREFCNT_inc_simple_void(sv_dat);
11586                 }
11587
11588                 RExC_sawback = 1;
11589                 ret = reganode(pRExC_state,
11590                                ((! FOLD)
11591                                  ? NREF
11592                                  : (ASCII_FOLD_RESTRICTED)
11593                                    ? NREFFA
11594                                    : (AT_LEAST_UNI_SEMANTICS)
11595                                      ? NREFFU
11596                                      : (LOC)
11597                                        ? NREFFL
11598                                        : NREFF),
11599                                 num);
11600                 *flagp |= HASWIDTH;
11601
11602                 /* override incorrect value set in reganode MJD */
11603                 Set_Node_Offset(ret, parse_start+1);
11604                 Set_Node_Cur_Length(ret, parse_start);
11605                 nextchar(pRExC_state);
11606
11607             }
11608             break;
11609         }
11610         case 'g':
11611         case '1': case '2': case '3': case '4':
11612         case '5': case '6': case '7': case '8': case '9':
11613             {
11614                 I32 num;
11615                 bool hasbrace = 0;
11616
11617                 if (*RExC_parse == 'g') {
11618                     bool isrel = 0;
11619
11620                     RExC_parse++;
11621                     if (*RExC_parse == '{') {
11622                         RExC_parse++;
11623                         hasbrace = 1;
11624                     }
11625                     if (*RExC_parse == '-') {
11626                         RExC_parse++;
11627                         isrel = 1;
11628                     }
11629                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11630                         if (isrel) RExC_parse--;
11631                         RExC_parse -= 2;
11632                         goto parse_named_seq;
11633                     }
11634
11635                     num = S_backref_value(RExC_parse);
11636                     if (num == 0)
11637                         vFAIL("Reference to invalid group 0");
11638                     else if (num == I32_MAX) {
11639                          if (isDIGIT(*RExC_parse))
11640                             vFAIL("Reference to nonexistent group");
11641                         else
11642                             vFAIL("Unterminated \\g... pattern");
11643                     }
11644
11645                     if (isrel) {
11646                         num = RExC_npar - num;
11647                         if (num < 1)
11648                             vFAIL("Reference to nonexistent or unclosed group");
11649                     }
11650                 }
11651                 else {
11652                     num = S_backref_value(RExC_parse);
11653                     /* bare \NNN might be backref or octal - if it is larger than or equal
11654                      * RExC_npar then it is assumed to be and octal escape.
11655                      * Note RExC_npar is +1 from the actual number of parens*/
11656                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11657                             && *RExC_parse != '8' && *RExC_parse != '9'))
11658                     {
11659                         /* Probably a character specified in octal, e.g. \35 */
11660                         goto defchar;
11661                     }
11662                 }
11663
11664                 /* at this point RExC_parse definitely points to a backref
11665                  * number */
11666                 {
11667 #ifdef RE_TRACK_PATTERN_OFFSETS
11668                     char * const parse_start = RExC_parse - 1; /* MJD */
11669 #endif
11670                     while (isDIGIT(*RExC_parse))
11671                         RExC_parse++;
11672                     if (hasbrace) {
11673                         if (*RExC_parse != '}')
11674                             vFAIL("Unterminated \\g{...} pattern");
11675                         RExC_parse++;
11676                     }
11677                     if (!SIZE_ONLY) {
11678                         if (num > (I32)RExC_rx->nparens)
11679                             vFAIL("Reference to nonexistent group");
11680                     }
11681                     RExC_sawback = 1;
11682                     ret = reganode(pRExC_state,
11683                                    ((! FOLD)
11684                                      ? REF
11685                                      : (ASCII_FOLD_RESTRICTED)
11686                                        ? REFFA
11687                                        : (AT_LEAST_UNI_SEMANTICS)
11688                                          ? REFFU
11689                                          : (LOC)
11690                                            ? REFFL
11691                                            : REFF),
11692                                     num);
11693                     *flagp |= HASWIDTH;
11694
11695                     /* override incorrect value set in reganode MJD */
11696                     Set_Node_Offset(ret, parse_start+1);
11697                     Set_Node_Cur_Length(ret, parse_start);
11698                     RExC_parse--;
11699                     nextchar(pRExC_state);
11700                 }
11701             }
11702             break;
11703         case '\0':
11704             if (RExC_parse >= RExC_end)
11705                 FAIL("Trailing \\");
11706             /* FALLTHROUGH */
11707         default:
11708             /* Do not generate "unrecognized" warnings here, we fall
11709                back into the quick-grab loop below */
11710             parse_start--;
11711             goto defchar;
11712         }
11713         break;
11714
11715     case '#':
11716         if (RExC_flags & RXf_PMf_EXTENDED) {
11717             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11718             if (RExC_parse < RExC_end)
11719                 goto tryagain;
11720         }
11721         /* FALLTHROUGH */
11722
11723     default:
11724
11725             parse_start = RExC_parse - 1;
11726
11727             RExC_parse++;
11728
11729         defchar: {
11730             STRLEN len = 0;
11731             UV ender = 0;
11732             char *p;
11733             char *s;
11734 #define MAX_NODE_STRING_SIZE 127
11735             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11736             char *s0;
11737             U8 upper_parse = MAX_NODE_STRING_SIZE;
11738             U8 node_type = compute_EXACTish(pRExC_state);
11739             bool next_is_quantifier;
11740             char * oldp = NULL;
11741
11742             /* We can convert EXACTF nodes to EXACTFU if they contain only
11743              * characters that match identically regardless of the target
11744              * string's UTF8ness.  The reason to do this is that EXACTF is not
11745              * trie-able, EXACTFU is.
11746              *
11747              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11748              * contain only above-Latin1 characters (hence must be in UTF8),
11749              * which don't participate in folds with Latin1-range characters,
11750              * as the latter's folds aren't known until runtime.  (We don't
11751              * need to figure this out until pass 2) */
11752             bool maybe_exactfu = PASS2
11753                                && (node_type == EXACTF || node_type == EXACTFL);
11754
11755             /* If a folding node contains only code points that don't
11756              * participate in folds, it can be changed into an EXACT node,
11757              * which allows the optimizer more things to look for */
11758             bool maybe_exact;
11759
11760             ret = reg_node(pRExC_state, node_type);
11761
11762             /* In pass1, folded, we use a temporary buffer instead of the
11763              * actual node, as the node doesn't exist yet */
11764             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11765
11766             s0 = s;
11767
11768         reparse:
11769
11770             /* We do the EXACTFish to EXACT node only if folding.  (And we
11771              * don't need to figure this out until pass 2) */
11772             maybe_exact = FOLD && PASS2;
11773
11774             /* XXX The node can hold up to 255 bytes, yet this only goes to
11775              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11776              * 255 allows us to not have to worry about overflow due to
11777              * converting to utf8 and fold expansion, but that value is
11778              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11779              * split up by this limit into a single one using the real max of
11780              * 255.  Even at 127, this breaks under rare circumstances.  If
11781              * folding, we do not want to split a node at a character that is a
11782              * non-final in a multi-char fold, as an input string could just
11783              * happen to want to match across the node boundary.  The join
11784              * would solve that problem if the join actually happens.  But a
11785              * series of more than two nodes in a row each of 127 would cause
11786              * the first join to succeed to get to 254, but then there wouldn't
11787              * be room for the next one, which could at be one of those split
11788              * multi-char folds.  I don't know of any fool-proof solution.  One
11789              * could back off to end with only a code point that isn't such a
11790              * non-final, but it is possible for there not to be any in the
11791              * entire node. */
11792             for (p = RExC_parse - 1;
11793                  len < upper_parse && p < RExC_end;
11794                  len++)
11795             {
11796                 oldp = p;
11797
11798                 if (RExC_flags & RXf_PMf_EXTENDED)
11799                     p = regpatws(pRExC_state, p,
11800                                           TRUE); /* means recognize comments */
11801                 switch ((U8)*p) {
11802                 case '^':
11803                 case '$':
11804                 case '.':
11805                 case '[':
11806                 case '(':
11807                 case ')':
11808                 case '|':
11809                     goto loopdone;
11810                 case '\\':
11811                     /* Literal Escapes Switch
11812
11813                        This switch is meant to handle escape sequences that
11814                        resolve to a literal character.
11815
11816                        Every escape sequence that represents something
11817                        else, like an assertion or a char class, is handled
11818                        in the switch marked 'Special Escapes' above in this
11819                        routine, but also has an entry here as anything that
11820                        isn't explicitly mentioned here will be treated as
11821                        an unescaped equivalent literal.
11822                     */
11823
11824                     switch ((U8)*++p) {
11825                     /* These are all the special escapes. */
11826                     case 'A':             /* Start assertion */
11827                     case 'b': case 'B':   /* Word-boundary assertion*/
11828                     case 'C':             /* Single char !DANGEROUS! */
11829                     case 'd': case 'D':   /* digit class */
11830                     case 'g': case 'G':   /* generic-backref, pos assertion */
11831                     case 'h': case 'H':   /* HORIZWS */
11832                     case 'k': case 'K':   /* named backref, keep marker */
11833                     case 'p': case 'P':   /* Unicode property */
11834                               case 'R':   /* LNBREAK */
11835                     case 's': case 'S':   /* space class */
11836                     case 'v': case 'V':   /* VERTWS */
11837                     case 'w': case 'W':   /* word class */
11838                     case 'X':             /* eXtended Unicode "combining
11839                                              character sequence" */
11840                     case 'z': case 'Z':   /* End of line/string assertion */
11841                         --p;
11842                         goto loopdone;
11843
11844                     /* Anything after here is an escape that resolves to a
11845                        literal. (Except digits, which may or may not)
11846                      */
11847                     case 'n':
11848                         ender = '\n';
11849                         p++;
11850                         break;
11851                     case 'N': /* Handle a single-code point named character. */
11852                         /* The options cause it to fail if a multiple code
11853                          * point sequence.  Handle those in the switch() above
11854                          * */
11855                         RExC_parse = p + 1;
11856                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11857                                             flagp, depth, FALSE,
11858                                             FALSE /* not strict */ ))
11859                         {
11860                             if (*flagp & RESTART_UTF8)
11861                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11862                             RExC_parse = p = oldp;
11863                             goto loopdone;
11864                         }
11865                         p = RExC_parse;
11866                         if (ender > 0xff) {
11867                             REQUIRE_UTF8;
11868                         }
11869                         break;
11870                     case 'r':
11871                         ender = '\r';
11872                         p++;
11873                         break;
11874                     case 't':
11875                         ender = '\t';
11876                         p++;
11877                         break;
11878                     case 'f':
11879                         ender = '\f';
11880                         p++;
11881                         break;
11882                     case 'e':
11883                           ender = ASCII_TO_NATIVE('\033');
11884                         p++;
11885                         break;
11886                     case 'a':
11887                           ender = '\a';
11888                         p++;
11889                         break;
11890                     case 'o':
11891                         {
11892                             UV result;
11893                             const char* error_msg;
11894
11895                             bool valid = grok_bslash_o(&p,
11896                                                        &result,
11897                                                        &error_msg,
11898                                                        TRUE, /* out warnings */
11899                                                        FALSE, /* not strict */
11900                                                        TRUE, /* Output warnings
11901                                                                 for non-
11902                                                                 portables */
11903                                                        UTF);
11904                             if (! valid) {
11905                                 RExC_parse = p; /* going to die anyway; point
11906                                                    to exact spot of failure */
11907                                 vFAIL(error_msg);
11908                             }
11909                             ender = result;
11910                             if (PL_encoding && ender < 0x100) {
11911                                 goto recode_encoding;
11912                             }
11913                             if (ender > 0xff) {
11914                                 REQUIRE_UTF8;
11915                             }
11916                             break;
11917                         }
11918                     case 'x':
11919                         {
11920                             UV result = UV_MAX; /* initialize to erroneous
11921                                                    value */
11922                             const char* error_msg;
11923
11924                             bool valid = grok_bslash_x(&p,
11925                                                        &result,
11926                                                        &error_msg,
11927                                                        TRUE, /* out warnings */
11928                                                        FALSE, /* not strict */
11929                                                        TRUE, /* Output warnings
11930                                                                 for non-
11931                                                                 portables */
11932                                                        UTF);
11933                             if (! valid) {
11934                                 RExC_parse = p; /* going to die anyway; point
11935                                                    to exact spot of failure */
11936                                 vFAIL(error_msg);
11937                             }
11938                             ender = result;
11939
11940                             if (PL_encoding && ender < 0x100) {
11941                                 goto recode_encoding;
11942                             }
11943                             if (ender > 0xff) {
11944                                 REQUIRE_UTF8;
11945                             }
11946                             break;
11947                         }
11948                     case 'c':
11949                         p++;
11950                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11951                         break;
11952                     case '8': case '9': /* must be a backreference */
11953                         --p;
11954                         goto loopdone;
11955                     case '1': case '2': case '3':case '4':
11956                     case '5': case '6': case '7':
11957                         /* When we parse backslash escapes there is ambiguity
11958                          * between backreferences and octal escapes. Any escape
11959                          * from \1 - \9 is a backreference, any multi-digit
11960                          * escape which does not start with 0 and which when
11961                          * evaluated as decimal could refer to an already
11962                          * parsed capture buffer is a backslash. Anything else
11963                          * is octal.
11964                          *
11965                          * Note this implies that \118 could be interpreted as
11966                          * 118 OR as "\11" . "8" depending on whether there
11967                          * were 118 capture buffers defined already in the
11968                          * pattern.  */
11969
11970                         /* NOTE, RExC_npar is 1 more than the actual number of
11971                          * parens we have seen so far, hence the < RExC_npar below. */
11972
11973                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11974                         {  /* Not to be treated as an octal constant, go
11975                                    find backref */
11976                             --p;
11977                             goto loopdone;
11978                         }
11979                         /* FALLTHROUGH */
11980                     case '0':
11981                         {
11982                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11983                             STRLEN numlen = 3;
11984                             ender = grok_oct(p, &numlen, &flags, NULL);
11985                             if (ender > 0xff) {
11986                                 REQUIRE_UTF8;
11987                             }
11988                             p += numlen;
11989                             if (SIZE_ONLY   /* like \08, \178 */
11990                                 && numlen < 3
11991                                 && p < RExC_end
11992                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11993                             {
11994                                 reg_warn_non_literal_string(
11995                                          p + 1,
11996                                          form_short_octal_warning(p, numlen));
11997                             }
11998                         }
11999                         if (PL_encoding && ender < 0x100)
12000                             goto recode_encoding;
12001                         break;
12002                     recode_encoding:
12003                         if (! RExC_override_recoding) {
12004                             SV* enc = PL_encoding;
12005                             ender = reg_recode((const char)(U8)ender, &enc);
12006                             if (!enc && SIZE_ONLY)
12007                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12008                             REQUIRE_UTF8;
12009                         }
12010                         break;
12011                     case '\0':
12012                         if (p >= RExC_end)
12013                             FAIL("Trailing \\");
12014                         /* FALLTHROUGH */
12015                     default:
12016                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12017                             /* Include any { following the alpha to emphasize
12018                              * that it could be part of an escape at some point
12019                              * in the future */
12020                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12021                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12022                         }
12023                         goto normal_default;
12024                     } /* End of switch on '\' */
12025                     break;
12026                 case '{':
12027                     /* Currently we don't warn when the lbrace is at the start
12028                      * of a construct.  This catches it in the middle of a
12029                      * literal string, or when its the first thing after
12030                      * something like "\b" */
12031                     if (! SIZE_ONLY
12032                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12033                     {
12034                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12035                     }
12036                     /*FALLTHROUGH*/
12037                 default:    /* A literal character */
12038                   normal_default:
12039                     if (UTF8_IS_START(*p) && UTF) {
12040                         STRLEN numlen;
12041                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12042                                                &numlen, UTF8_ALLOW_DEFAULT);
12043                         p += numlen;
12044                     }
12045                     else
12046                         ender = (U8) *p++;
12047                     break;
12048                 } /* End of switch on the literal */
12049
12050                 /* Here, have looked at the literal character and <ender>
12051                  * contains its ordinal, <p> points to the character after it
12052                  */
12053
12054                 if ( RExC_flags & RXf_PMf_EXTENDED)
12055                     p = regpatws(pRExC_state, p,
12056                                           TRUE); /* means recognize comments */
12057
12058                 /* If the next thing is a quantifier, it applies to this
12059                  * character only, which means that this character has to be in
12060                  * its own node and can't just be appended to the string in an
12061                  * existing node, so if there are already other characters in
12062                  * the node, close the node with just them, and set up to do
12063                  * this character again next time through, when it will be the
12064                  * only thing in its new node */
12065                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12066                 {
12067                     p = oldp;
12068                     goto loopdone;
12069                 }
12070
12071                 if (! FOLD   /* The simple case, just append the literal */
12072                     || (LOC  /* Also don't fold for tricky chars under /l */
12073                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12074                 {
12075                     if (UTF) {
12076                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12077                         if (unilen > 0) {
12078                            s   += unilen;
12079                            len += unilen;
12080                         }
12081
12082                         /* The loop increments <len> each time, as all but this
12083                          * path (and one other) through it add a single byte to
12084                          * the EXACTish node.  But this one has changed len to
12085                          * be the correct final value, so subtract one to
12086                          * cancel out the increment that follows */
12087                         len--;
12088                     }
12089                     else {
12090                         REGC((char)ender, s++);
12091                     }
12092
12093                     /* Can get here if folding only if is one of the /l
12094                      * characters whose fold depends on the locale.  The
12095                      * occurrence of any of these indicate that we can't
12096                      * simplify things */
12097                     if (FOLD) {
12098                         maybe_exact = FALSE;
12099                         maybe_exactfu = FALSE;
12100                     }
12101                 }
12102                 else             /* FOLD */
12103                      if (! ( UTF
12104                         /* See comments for join_exact() as to why we fold this
12105                          * non-UTF at compile time */
12106                         || (node_type == EXACTFU
12107                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12108                 {
12109                     /* Here, are folding and are not UTF-8 encoded; therefore
12110                      * the character must be in the range 0-255, and is not /l
12111                      * (Not /l because we already handled these under /l in
12112                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12113                     if (IS_IN_SOME_FOLD_L1(ender)) {
12114                         maybe_exact = FALSE;
12115
12116                         /* See if the character's fold differs between /d and
12117                          * /u.  This includes the multi-char fold SHARP S to
12118                          * 'ss' */
12119                         if (maybe_exactfu
12120                             && (PL_fold[ender] != PL_fold_latin1[ender]
12121                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12122                                 || (len > 0
12123                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12124                                    && isARG2_lower_or_UPPER_ARG1('s',
12125                                                                  *(s-1)))))
12126                         {
12127                             maybe_exactfu = FALSE;
12128                         }
12129                     }
12130
12131                     /* Even when folding, we store just the input character, as
12132                      * we have an array that finds its fold quickly */
12133                     *(s++) = (char) ender;
12134                 }
12135                 else {  /* FOLD and UTF */
12136                     /* Unlike the non-fold case, we do actually have to
12137                      * calculate the results here in pass 1.  This is for two
12138                      * reasons, the folded length may be longer than the
12139                      * unfolded, and we have to calculate how many EXACTish
12140                      * nodes it will take; and we may run out of room in a node
12141                      * in the middle of a potential multi-char fold, and have
12142                      * to back off accordingly.  (Hence we can't use REGC for
12143                      * the simple case just below.) */
12144
12145                     UV folded;
12146                     if (isASCII(ender)) {
12147                         folded = toFOLD(ender);
12148                         *(s)++ = (U8) folded;
12149                     }
12150                     else {
12151                         STRLEN foldlen;
12152
12153                         folded = _to_uni_fold_flags(
12154                                      ender,
12155                                      (U8 *) s,
12156                                      &foldlen,
12157                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12158                                                         ? FOLD_FLAGS_NOMIX_ASCII
12159                                                         : 0));
12160                         s += foldlen;
12161
12162                         /* The loop increments <len> each time, as all but this
12163                          * path (and one other) through it add a single byte to
12164                          * the EXACTish node.  But this one has changed len to
12165                          * be the correct final value, so subtract one to
12166                          * cancel out the increment that follows */
12167                         len += foldlen - 1;
12168                     }
12169                     /* If this node only contains non-folding code points so
12170                      * far, see if this new one is also non-folding */
12171                     if (maybe_exact) {
12172                         if (folded != ender) {
12173                             maybe_exact = FALSE;
12174                         }
12175                         else {
12176                             /* Here the fold is the original; we have to check
12177                              * further to see if anything folds to it */
12178                             if (_invlist_contains_cp(PL_utf8_foldable,
12179                                                         ender))
12180                             {
12181                                 maybe_exact = FALSE;
12182                             }
12183                         }
12184                     }
12185                     ender = folded;
12186                 }
12187
12188                 if (next_is_quantifier) {
12189
12190                     /* Here, the next input is a quantifier, and to get here,
12191                      * the current character is the only one in the node.
12192                      * Also, here <len> doesn't include the final byte for this
12193                      * character */
12194                     len++;
12195                     goto loopdone;
12196                 }
12197
12198             } /* End of loop through literal characters */
12199
12200             /* Here we have either exhausted the input or ran out of room in
12201              * the node.  (If we encountered a character that can't be in the
12202              * node, transfer is made directly to <loopdone>, and so we
12203              * wouldn't have fallen off the end of the loop.)  In the latter
12204              * case, we artificially have to split the node into two, because
12205              * we just don't have enough space to hold everything.  This
12206              * creates a problem if the final character participates in a
12207              * multi-character fold in the non-final position, as a match that
12208              * should have occurred won't, due to the way nodes are matched,
12209              * and our artificial boundary.  So back off until we find a non-
12210              * problematic character -- one that isn't at the beginning or
12211              * middle of such a fold.  (Either it doesn't participate in any
12212              * folds, or appears only in the final position of all the folds it
12213              * does participate in.)  A better solution with far fewer false
12214              * positives, and that would fill the nodes more completely, would
12215              * be to actually have available all the multi-character folds to
12216              * test against, and to back-off only far enough to be sure that
12217              * this node isn't ending with a partial one.  <upper_parse> is set
12218              * further below (if we need to reparse the node) to include just
12219              * up through that final non-problematic character that this code
12220              * identifies, so when it is set to less than the full node, we can
12221              * skip the rest of this */
12222             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12223
12224                 const STRLEN full_len = len;
12225
12226                 assert(len >= MAX_NODE_STRING_SIZE);
12227
12228                 /* Here, <s> points to the final byte of the final character.
12229                  * Look backwards through the string until find a non-
12230                  * problematic character */
12231
12232                 if (! UTF) {
12233
12234                     /* This has no multi-char folds to non-UTF characters */
12235                     if (ASCII_FOLD_RESTRICTED) {
12236                         goto loopdone;
12237                     }
12238
12239                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12240                     len = s - s0 + 1;
12241                 }
12242                 else {
12243                     if (!  PL_NonL1NonFinalFold) {
12244                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12245                                         NonL1_Perl_Non_Final_Folds_invlist);
12246                     }
12247
12248                     /* Point to the first byte of the final character */
12249                     s = (char *) utf8_hop((U8 *) s, -1);
12250
12251                     while (s >= s0) {   /* Search backwards until find
12252                                            non-problematic char */
12253                         if (UTF8_IS_INVARIANT(*s)) {
12254
12255                             /* There are no ascii characters that participate
12256                              * in multi-char folds under /aa.  In EBCDIC, the
12257                              * non-ascii invariants are all control characters,
12258                              * so don't ever participate in any folds. */
12259                             if (ASCII_FOLD_RESTRICTED
12260                                 || ! IS_NON_FINAL_FOLD(*s))
12261                             {
12262                                 break;
12263                             }
12264                         }
12265                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12266                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12267                                                                   *s, *(s+1))))
12268                             {
12269                                 break;
12270                             }
12271                         }
12272                         else if (! _invlist_contains_cp(
12273                                         PL_NonL1NonFinalFold,
12274                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12275                         {
12276                             break;
12277                         }
12278
12279                         /* Here, the current character is problematic in that
12280                          * it does occur in the non-final position of some
12281                          * fold, so try the character before it, but have to
12282                          * special case the very first byte in the string, so
12283                          * we don't read outside the string */
12284                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12285                     } /* End of loop backwards through the string */
12286
12287                     /* If there were only problematic characters in the string,
12288                      * <s> will point to before s0, in which case the length
12289                      * should be 0, otherwise include the length of the
12290                      * non-problematic character just found */
12291                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12292                 }
12293
12294                 /* Here, have found the final character, if any, that is
12295                  * non-problematic as far as ending the node without splitting
12296                  * it across a potential multi-char fold.  <len> contains the
12297                  * number of bytes in the node up-to and including that
12298                  * character, or is 0 if there is no such character, meaning
12299                  * the whole node contains only problematic characters.  In
12300                  * this case, give up and just take the node as-is.  We can't
12301                  * do any better */
12302                 if (len == 0) {
12303                     len = full_len;
12304
12305                     /* If the node ends in an 's' we make sure it stays EXACTF,
12306                      * as if it turns into an EXACTFU, it could later get
12307                      * joined with another 's' that would then wrongly match
12308                      * the sharp s */
12309                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12310                     {
12311                         maybe_exactfu = FALSE;
12312                     }
12313                 } else {
12314
12315                     /* Here, the node does contain some characters that aren't
12316                      * problematic.  If one such is the final character in the
12317                      * node, we are done */
12318                     if (len == full_len) {
12319                         goto loopdone;
12320                     }
12321                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12322
12323                         /* If the final character is problematic, but the
12324                          * penultimate is not, back-off that last character to
12325                          * later start a new node with it */
12326                         p = oldp;
12327                         goto loopdone;
12328                     }
12329
12330                     /* Here, the final non-problematic character is earlier
12331                      * in the input than the penultimate character.  What we do
12332                      * is reparse from the beginning, going up only as far as
12333                      * this final ok one, thus guaranteeing that the node ends
12334                      * in an acceptable character.  The reason we reparse is
12335                      * that we know how far in the character is, but we don't
12336                      * know how to correlate its position with the input parse.
12337                      * An alternate implementation would be to build that
12338                      * correlation as we go along during the original parse,
12339                      * but that would entail extra work for every node, whereas
12340                      * this code gets executed only when the string is too
12341                      * large for the node, and the final two characters are
12342                      * problematic, an infrequent occurrence.  Yet another
12343                      * possible strategy would be to save the tail of the
12344                      * string, and the next time regatom is called, initialize
12345                      * with that.  The problem with this is that unless you
12346                      * back off one more character, you won't be guaranteed
12347                      * regatom will get called again, unless regbranch,
12348                      * regpiece ... are also changed.  If you do back off that
12349                      * extra character, so that there is input guaranteed to
12350                      * force calling regatom, you can't handle the case where
12351                      * just the first character in the node is acceptable.  I
12352                      * (khw) decided to try this method which doesn't have that
12353                      * pitfall; if performance issues are found, we can do a
12354                      * combination of the current approach plus that one */
12355                     upper_parse = len;
12356                     len = 0;
12357                     s = s0;
12358                     goto reparse;
12359                 }
12360             }   /* End of verifying node ends with an appropriate char */
12361
12362         loopdone:   /* Jumped to when encounters something that shouldn't be in
12363                        the node */
12364
12365             /* I (khw) don't know if you can get here with zero length, but the
12366              * old code handled this situation by creating a zero-length EXACT
12367              * node.  Might as well be NOTHING instead */
12368             if (len == 0) {
12369                 OP(ret) = NOTHING;
12370             }
12371             else {
12372                 if (FOLD) {
12373                     /* If 'maybe_exact' is still set here, means there are no
12374                      * code points in the node that participate in folds;
12375                      * similarly for 'maybe_exactfu' and code points that match
12376                      * differently depending on UTF8ness of the target string
12377                      * (for /u), or depending on locale for /l */
12378                     if (maybe_exact) {
12379                         OP(ret) = EXACT;
12380                     }
12381                     else if (maybe_exactfu) {
12382                         OP(ret) = EXACTFU;
12383                     }
12384                 }
12385                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12386                                            FALSE /* Don't look to see if could
12387                                                     be turned into an EXACT
12388                                                     node, as we have already
12389                                                     computed that */
12390                                           );
12391             }
12392
12393             RExC_parse = p - 1;
12394             Set_Node_Cur_Length(ret, parse_start);
12395             nextchar(pRExC_state);
12396             {
12397                 /* len is STRLEN which is unsigned, need to copy to signed */
12398                 IV iv = len;
12399                 if (iv < 0)
12400                     vFAIL("Internal disaster");
12401             }
12402
12403         } /* End of label 'defchar:' */
12404         break;
12405     } /* End of giant switch on input character */
12406
12407     return(ret);
12408 }
12409
12410 STATIC char *
12411 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12412 {
12413     /* Returns the next non-pattern-white space, non-comment character (the
12414      * latter only if 'recognize_comment is true) in the string p, which is
12415      * ended by RExC_end.  See also reg_skipcomment */
12416     const char *e = RExC_end;
12417
12418     PERL_ARGS_ASSERT_REGPATWS;
12419
12420     while (p < e) {
12421         STRLEN len;
12422         if ((len = is_PATWS_safe(p, e, UTF))) {
12423             p += len;
12424         }
12425         else if (recognize_comment && *p == '#') {
12426             p = reg_skipcomment(pRExC_state, p);
12427         }
12428         else
12429             break;
12430     }
12431     return p;
12432 }
12433
12434 STATIC void
12435 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12436 {
12437     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12438      * sets up the bitmap and any flags, removing those code points from the
12439      * inversion list, setting it to NULL should it become completely empty */
12440
12441     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12442     assert(PL_regkind[OP(node)] == ANYOF);
12443
12444     ANYOF_BITMAP_ZERO(node);
12445     if (*invlist_ptr) {
12446
12447         /* This gets set if we actually need to modify things */
12448         bool change_invlist = FALSE;
12449
12450         UV start, end;
12451
12452         /* Start looking through *invlist_ptr */
12453         invlist_iterinit(*invlist_ptr);
12454         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12455             UV high;
12456             int i;
12457
12458             if (end == UV_MAX && start <= 256) {
12459                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12460             }
12461             else if (end >= 256) {
12462                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12463             }
12464
12465             /* Quit if are above what we should change */
12466             if (start > 255) {
12467                 break;
12468             }
12469
12470             change_invlist = TRUE;
12471
12472             /* Set all the bits in the range, up to the max that we are doing */
12473             high = (end < 255) ? end : 255;
12474             for (i = start; i <= (int) high; i++) {
12475                 if (! ANYOF_BITMAP_TEST(node, i)) {
12476                     ANYOF_BITMAP_SET(node, i);
12477                 }
12478             }
12479         }
12480         invlist_iterfinish(*invlist_ptr);
12481
12482         /* Done with loop; remove any code points that are in the bitmap from
12483          * *invlist_ptr; similarly for code points above latin1 if we have a
12484          * flag to match all of them anyways */
12485         if (change_invlist) {
12486             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12487         }
12488         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12489             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12490         }
12491
12492         /* If have completely emptied it, remove it completely */
12493         if (_invlist_len(*invlist_ptr) == 0) {
12494             SvREFCNT_dec_NN(*invlist_ptr);
12495             *invlist_ptr = NULL;
12496         }
12497     }
12498 }
12499
12500 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12501    Character classes ([:foo:]) can also be negated ([:^foo:]).
12502    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12503    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12504    but trigger failures because they are currently unimplemented. */
12505
12506 #define POSIXCC_DONE(c)   ((c) == ':')
12507 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12508 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12509
12510 PERL_STATIC_INLINE I32
12511 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12512 {
12513     dVAR;
12514     I32 namedclass = OOB_NAMEDCLASS;
12515
12516     PERL_ARGS_ASSERT_REGPPOSIXCC;
12517
12518     if (value == '[' && RExC_parse + 1 < RExC_end &&
12519         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12520         POSIXCC(UCHARAT(RExC_parse)))
12521     {
12522         const char c = UCHARAT(RExC_parse);
12523         char* const s = RExC_parse++;
12524
12525         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12526             RExC_parse++;
12527         if (RExC_parse == RExC_end) {
12528             if (strict) {
12529
12530                 /* Try to give a better location for the error (than the end of
12531                  * the string) by looking for the matching ']' */
12532                 RExC_parse = s;
12533                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12534                     RExC_parse++;
12535                 }
12536                 vFAIL2("Unmatched '%c' in POSIX class", c);
12537             }
12538             /* Grandfather lone [:, [=, [. */
12539             RExC_parse = s;
12540         }
12541         else {
12542             const char* const t = RExC_parse++; /* skip over the c */
12543             assert(*t == c);
12544
12545             if (UCHARAT(RExC_parse) == ']') {
12546                 const char *posixcc = s + 1;
12547                 RExC_parse++; /* skip over the ending ] */
12548
12549                 if (*s == ':') {
12550                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12551                     const I32 skip = t - posixcc;
12552
12553                     /* Initially switch on the length of the name.  */
12554                     switch (skip) {
12555                     case 4:
12556                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12557                                                           this is the Perl \w
12558                                                         */
12559                             namedclass = ANYOF_WORDCHAR;
12560                         break;
12561                     case 5:
12562                         /* Names all of length 5.  */
12563                         /* alnum alpha ascii blank cntrl digit graph lower
12564                            print punct space upper  */
12565                         /* Offset 4 gives the best switch position.  */
12566                         switch (posixcc[4]) {
12567                         case 'a':
12568                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12569                                 namedclass = ANYOF_ALPHA;
12570                             break;
12571                         case 'e':
12572                             if (memEQ(posixcc, "spac", 4)) /* space */
12573                                 namedclass = ANYOF_PSXSPC;
12574                             break;
12575                         case 'h':
12576                             if (memEQ(posixcc, "grap", 4)) /* graph */
12577                                 namedclass = ANYOF_GRAPH;
12578                             break;
12579                         case 'i':
12580                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12581                                 namedclass = ANYOF_ASCII;
12582                             break;
12583                         case 'k':
12584                             if (memEQ(posixcc, "blan", 4)) /* blank */
12585                                 namedclass = ANYOF_BLANK;
12586                             break;
12587                         case 'l':
12588                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12589                                 namedclass = ANYOF_CNTRL;
12590                             break;
12591                         case 'm':
12592                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12593                                 namedclass = ANYOF_ALPHANUMERIC;
12594                             break;
12595                         case 'r':
12596                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12597                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12598                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12599                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12600                             break;
12601                         case 't':
12602                             if (memEQ(posixcc, "digi", 4)) /* digit */
12603                                 namedclass = ANYOF_DIGIT;
12604                             else if (memEQ(posixcc, "prin", 4)) /* print */
12605                                 namedclass = ANYOF_PRINT;
12606                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12607                                 namedclass = ANYOF_PUNCT;
12608                             break;
12609                         }
12610                         break;
12611                     case 6:
12612                         if (memEQ(posixcc, "xdigit", 6))
12613                             namedclass = ANYOF_XDIGIT;
12614                         break;
12615                     }
12616
12617                     if (namedclass == OOB_NAMEDCLASS)
12618                         vFAIL2utf8f(
12619                             "POSIX class [:%"UTF8f":] unknown",
12620                             UTF8fARG(UTF, t - s - 1, s + 1));
12621
12622                     /* The #defines are structured so each complement is +1 to
12623                      * the normal one */
12624                     if (complement) {
12625                         namedclass++;
12626                     }
12627                     assert (posixcc[skip] == ':');
12628                     assert (posixcc[skip+1] == ']');
12629                 } else if (!SIZE_ONLY) {
12630                     /* [[=foo=]] and [[.foo.]] are still future. */
12631
12632                     /* adjust RExC_parse so the warning shows after
12633                        the class closes */
12634                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12635                         RExC_parse++;
12636                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12637                 }
12638             } else {
12639                 /* Maternal grandfather:
12640                  * "[:" ending in ":" but not in ":]" */
12641                 if (strict) {
12642                     vFAIL("Unmatched '[' in POSIX class");
12643                 }
12644
12645                 /* Grandfather lone [:, [=, [. */
12646                 RExC_parse = s;
12647             }
12648         }
12649     }
12650
12651     return namedclass;
12652 }
12653
12654 STATIC bool
12655 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12656 {
12657     /* This applies some heuristics at the current parse position (which should
12658      * be at a '[') to see if what follows might be intended to be a [:posix:]
12659      * class.  It returns true if it really is a posix class, of course, but it
12660      * also can return true if it thinks that what was intended was a posix
12661      * class that didn't quite make it.
12662      *
12663      * It will return true for
12664      *      [:alphanumerics:
12665      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12666      *                         ')' indicating the end of the (?[
12667      *      [:any garbage including %^&$ punctuation:]
12668      *
12669      * This is designed to be called only from S_handle_regex_sets; it could be
12670      * easily adapted to be called from the spot at the beginning of regclass()
12671      * that checks to see in a normal bracketed class if the surrounding []
12672      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12673      * change long-standing behavior, so I (khw) didn't do that */
12674     char* p = RExC_parse + 1;
12675     char first_char = *p;
12676
12677     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12678
12679     assert(*(p - 1) == '[');
12680
12681     if (! POSIXCC(first_char)) {
12682         return FALSE;
12683     }
12684
12685     p++;
12686     while (p < RExC_end && isWORDCHAR(*p)) p++;
12687
12688     if (p >= RExC_end) {
12689         return FALSE;
12690     }
12691
12692     if (p - RExC_parse > 2    /* Got at least 1 word character */
12693         && (*p == first_char
12694             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12695     {
12696         return TRUE;
12697     }
12698
12699     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12700
12701     return (p
12702             && p - RExC_parse > 2 /* [:] evaluates to colon;
12703                                       [::] is a bad posix class. */
12704             && first_char == *(p - 1));
12705 }
12706
12707 STATIC regnode *
12708 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12709                     I32 *flagp, U32 depth,
12710                     char * const oregcomp_parse)
12711 {
12712     /* Handle the (?[...]) construct to do set operations */
12713
12714     U8 curchar;
12715     UV start, end;      /* End points of code point ranges */
12716     SV* result_string;
12717     char *save_end, *save_parse;
12718     SV* final;
12719     STRLEN len;
12720     regnode* node;
12721     AV* stack;
12722     const bool save_fold = FOLD;
12723
12724     GET_RE_DEBUG_FLAGS_DECL;
12725
12726     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12727
12728     if (LOC) {
12729         vFAIL("(?[...]) not valid in locale");
12730     }
12731     RExC_uni_semantics = 1;
12732
12733     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12734      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12735      * call regclass to handle '[]' so as to not have to reinvent its parsing
12736      * rules here (throwing away the size it computes each time).  And, we exit
12737      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12738      * these things, we need to realize that something preceded by a backslash
12739      * is escaped, so we have to keep track of backslashes */
12740     if (SIZE_ONLY) {
12741         UV depth = 0; /* how many nested (?[...]) constructs */
12742
12743         Perl_ck_warner_d(aTHX_
12744             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12745             "The regex_sets feature is experimental" REPORT_LOCATION,
12746                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12747                 UTF8fARG(UTF,
12748                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12749                          RExC_precomp + (RExC_parse - RExC_precomp)));
12750
12751         while (RExC_parse < RExC_end) {
12752             SV* current = NULL;
12753             RExC_parse = regpatws(pRExC_state, RExC_parse,
12754                                           TRUE); /* means recognize comments */
12755             switch (*RExC_parse) {
12756                 case '?':
12757                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12758                     /* FALLTHROUGH */
12759                 default:
12760                     break;
12761                 case '\\':
12762                     /* Skip the next byte (which could cause us to end up in
12763                      * the middle of a UTF-8 character, but since none of those
12764                      * are confusable with anything we currently handle in this
12765                      * switch (invariants all), it's safe.  We'll just hit the
12766                      * default: case next time and keep on incrementing until
12767                      * we find one of the invariants we do handle. */
12768                     RExC_parse++;
12769                     break;
12770                 case '[':
12771                 {
12772                     /* If this looks like it is a [:posix:] class, leave the
12773                      * parse pointer at the '[' to fool regclass() into
12774                      * thinking it is part of a '[[:posix:]]'.  That function
12775                      * will use strict checking to force a syntax error if it
12776                      * doesn't work out to a legitimate class */
12777                     bool is_posix_class
12778                                     = could_it_be_a_POSIX_class(pRExC_state);
12779                     if (! is_posix_class) {
12780                         RExC_parse++;
12781                     }
12782
12783                     /* regclass() can only return RESTART_UTF8 if multi-char
12784                        folds are allowed.  */
12785                     if (!regclass(pRExC_state, flagp,depth+1,
12786                                   is_posix_class, /* parse the whole char
12787                                                      class only if not a
12788                                                      posix class */
12789                                   FALSE, /* don't allow multi-char folds */
12790                                   TRUE, /* silence non-portable warnings. */
12791                                   &current))
12792                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12793                               (UV) *flagp);
12794
12795                     /* function call leaves parse pointing to the ']', except
12796                      * if we faked it */
12797                     if (is_posix_class) {
12798                         RExC_parse--;
12799                     }
12800
12801                     SvREFCNT_dec(current);   /* In case it returned something */
12802                     break;
12803                 }
12804
12805                 case ']':
12806                     if (depth--) break;
12807                     RExC_parse++;
12808                     if (RExC_parse < RExC_end
12809                         && *RExC_parse == ')')
12810                     {
12811                         node = reganode(pRExC_state, ANYOF, 0);
12812                         RExC_size += ANYOF_SKIP;
12813                         nextchar(pRExC_state);
12814                         Set_Node_Length(node,
12815                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12816                         return node;
12817                     }
12818                     goto no_close;
12819             }
12820             RExC_parse++;
12821         }
12822
12823         no_close:
12824         FAIL("Syntax error in (?[...])");
12825     }
12826
12827     /* Pass 2 only after this.  Everything in this construct is a
12828      * metacharacter.  Operands begin with either a '\' (for an escape
12829      * sequence), or a '[' for a bracketed character class.  Any other
12830      * character should be an operator, or parenthesis for grouping.  Both
12831      * types of operands are handled by calling regclass() to parse them.  It
12832      * is called with a parameter to indicate to return the computed inversion
12833      * list.  The parsing here is implemented via a stack.  Each entry on the
12834      * stack is a single character representing one of the operators, or the
12835      * '('; or else a pointer to an operand inversion list. */
12836
12837 #define IS_OPERAND(a)  (! SvIOK(a))
12838
12839     /* The stack starts empty.  It is a syntax error if the first thing parsed
12840      * is a binary operator; everything else is pushed on the stack.  When an
12841      * operand is parsed, the top of the stack is examined.  If it is a binary
12842      * operator, the item before it should be an operand, and both are replaced
12843      * by the result of doing that operation on the new operand and the one on
12844      * the stack.   Thus a sequence of binary operands is reduced to a single
12845      * one before the next one is parsed.
12846      *
12847      * A unary operator may immediately follow a binary in the input, for
12848      * example
12849      *      [a] + ! [b]
12850      * When an operand is parsed and the top of the stack is a unary operator,
12851      * the operation is performed, and then the stack is rechecked to see if
12852      * this new operand is part of a binary operation; if so, it is handled as
12853      * above.
12854      *
12855      * A '(' is simply pushed on the stack; it is valid only if the stack is
12856      * empty, or the top element of the stack is an operator or another '('
12857      * (for which the parenthesized expression will become an operand).  By the
12858      * time the corresponding ')' is parsed everything in between should have
12859      * been parsed and evaluated to a single operand (or else is a syntax
12860      * error), and is handled as a regular operand */
12861
12862     sv_2mortal((SV *)(stack = newAV()));
12863
12864     while (RExC_parse < RExC_end) {
12865         I32 top_index = av_tindex(stack);
12866         SV** top_ptr;
12867         SV* current = NULL;
12868
12869         /* Skip white space */
12870         RExC_parse = regpatws(pRExC_state, RExC_parse,
12871                                          TRUE /* means recognize comments */ );
12872         if (RExC_parse >= RExC_end) {
12873             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12874         }
12875         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12876             break;
12877         }
12878
12879         switch (curchar) {
12880
12881             case '?':
12882                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12883                                                safely subtract 1 from
12884                                                RExC_parse in the next clause.
12885                                                If we have something on the
12886                                                stack, we have parsed something
12887                                              */
12888                     && UCHARAT(RExC_parse - 1) == '('
12889                     && RExC_parse < RExC_end)
12890                 {
12891                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12892                      * This happens when we have some thing like
12893                      *
12894                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12895                      *   ...
12896                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12897                      *
12898                      * Here we would be handling the interpolated
12899                      * '$thai_or_lao'.  We handle this by a recursive call to
12900                      * ourselves which returns the inversion list the
12901                      * interpolated expression evaluates to.  We use the flags
12902                      * from the interpolated pattern. */
12903                     U32 save_flags = RExC_flags;
12904                     const char * const save_parse = ++RExC_parse;
12905
12906                     parse_lparen_question_flags(pRExC_state);
12907
12908                     if (RExC_parse == save_parse  /* Makes sure there was at
12909                                                      least one flag (or this
12910                                                      embedding wasn't compiled)
12911                                                    */
12912                         || RExC_parse >= RExC_end - 4
12913                         || UCHARAT(RExC_parse) != ':'
12914                         || UCHARAT(++RExC_parse) != '('
12915                         || UCHARAT(++RExC_parse) != '?'
12916                         || UCHARAT(++RExC_parse) != '[')
12917                     {
12918
12919                         /* In combination with the above, this moves the
12920                          * pointer to the point just after the first erroneous
12921                          * character (or if there are no flags, to where they
12922                          * should have been) */
12923                         if (RExC_parse >= RExC_end - 4) {
12924                             RExC_parse = RExC_end;
12925                         }
12926                         else if (RExC_parse != save_parse) {
12927                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12928                         }
12929                         vFAIL("Expecting '(?flags:(?[...'");
12930                     }
12931                     RExC_parse++;
12932                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12933                                                     depth+1, oregcomp_parse);
12934
12935                     /* Here, 'current' contains the embedded expression's
12936                      * inversion list, and RExC_parse points to the trailing
12937                      * ']'; the next character should be the ')' which will be
12938                      * paired with the '(' that has been put on the stack, so
12939                      * the whole embedded expression reduces to '(operand)' */
12940                     RExC_parse++;
12941
12942                     RExC_flags = save_flags;
12943                     goto handle_operand;
12944                 }
12945                 /* FALLTHROUGH */
12946
12947             default:
12948                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12949                 vFAIL("Unexpected character");
12950
12951             case '\\':
12952                 /* regclass() can only return RESTART_UTF8 if multi-char
12953                    folds are allowed.  */
12954                 if (!regclass(pRExC_state, flagp,depth+1,
12955                               TRUE, /* means parse just the next thing */
12956                               FALSE, /* don't allow multi-char folds */
12957                               FALSE, /* don't silence non-portable warnings.  */
12958                               &current))
12959                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12960                           (UV) *flagp);
12961                 /* regclass() will return with parsing just the \ sequence,
12962                  * leaving the parse pointer at the next thing to parse */
12963                 RExC_parse--;
12964                 goto handle_operand;
12965
12966             case '[':   /* Is a bracketed character class */
12967             {
12968                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12969
12970                 if (! is_posix_class) {
12971                     RExC_parse++;
12972                 }
12973
12974                 /* regclass() can only return RESTART_UTF8 if multi-char
12975                    folds are allowed.  */
12976                 if(!regclass(pRExC_state, flagp,depth+1,
12977                              is_posix_class, /* parse the whole char class
12978                                                 only if not a posix class */
12979                              FALSE, /* don't allow multi-char folds */
12980                              FALSE, /* don't silence non-portable warnings.  */
12981                              &current))
12982                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12983                           (UV) *flagp);
12984                 /* function call leaves parse pointing to the ']', except if we
12985                  * faked it */
12986                 if (is_posix_class) {
12987                     RExC_parse--;
12988                 }
12989
12990                 goto handle_operand;
12991             }
12992
12993             case '&':
12994             case '|':
12995             case '+':
12996             case '-':
12997             case '^':
12998                 if (top_index < 0
12999                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13000                     || ! IS_OPERAND(*top_ptr))
13001                 {
13002                     RExC_parse++;
13003                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13004                 }
13005                 av_push(stack, newSVuv(curchar));
13006                 break;
13007
13008             case '!':
13009                 av_push(stack, newSVuv(curchar));
13010                 break;
13011
13012             case '(':
13013                 if (top_index >= 0) {
13014                     top_ptr = av_fetch(stack, top_index, FALSE);
13015                     assert(top_ptr);
13016                     if (IS_OPERAND(*top_ptr)) {
13017                         RExC_parse++;
13018                         vFAIL("Unexpected '(' with no preceding operator");
13019                     }
13020                 }
13021                 av_push(stack, newSVuv(curchar));
13022                 break;
13023
13024             case ')':
13025             {
13026                 SV* lparen;
13027                 if (top_index < 1
13028                     || ! (current = av_pop(stack))
13029                     || ! IS_OPERAND(current)
13030                     || ! (lparen = av_pop(stack))
13031                     || IS_OPERAND(lparen)
13032                     || SvUV(lparen) != '(')
13033                 {
13034                     SvREFCNT_dec(current);
13035                     RExC_parse++;
13036                     vFAIL("Unexpected ')'");
13037                 }
13038                 top_index -= 2;
13039                 SvREFCNT_dec_NN(lparen);
13040
13041                 /* FALLTHROUGH */
13042             }
13043
13044               handle_operand:
13045
13046                 /* Here, we have an operand to process, in 'current' */
13047
13048                 if (top_index < 0) {    /* Just push if stack is empty */
13049                     av_push(stack, current);
13050                 }
13051                 else {
13052                     SV* top = av_pop(stack);
13053                     SV *prev = NULL;
13054                     char current_operator;
13055
13056                     if (IS_OPERAND(top)) {
13057                         SvREFCNT_dec_NN(top);
13058                         SvREFCNT_dec_NN(current);
13059                         vFAIL("Operand with no preceding operator");
13060                     }
13061                     current_operator = (char) SvUV(top);
13062                     switch (current_operator) {
13063                         case '(':   /* Push the '(' back on followed by the new
13064                                        operand */
13065                             av_push(stack, top);
13066                             av_push(stack, current);
13067                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13068                                                    just after the 'break', so
13069                                                    it doesn't get wrongly freed
13070                                                  */
13071                             break;
13072
13073                         case '!':
13074                             _invlist_invert(current);
13075
13076                             /* Unlike binary operators, the top of the stack,
13077                              * now that this unary one has been popped off, may
13078                              * legally be an operator, and we now have operand
13079                              * for it. */
13080                             top_index--;
13081                             SvREFCNT_dec_NN(top);
13082                             goto handle_operand;
13083
13084                         case '&':
13085                             prev = av_pop(stack);
13086                             _invlist_intersection(prev,
13087                                                    current,
13088                                                    &current);
13089                             av_push(stack, current);
13090                             break;
13091
13092                         case '|':
13093                         case '+':
13094                             prev = av_pop(stack);
13095                             _invlist_union(prev, current, &current);
13096                             av_push(stack, current);
13097                             break;
13098
13099                         case '-':
13100                             prev = av_pop(stack);;
13101                             _invlist_subtract(prev, current, &current);
13102                             av_push(stack, current);
13103                             break;
13104
13105                         case '^':   /* The union minus the intersection */
13106                         {
13107                             SV* i = NULL;
13108                             SV* u = NULL;
13109                             SV* element;
13110
13111                             prev = av_pop(stack);
13112                             _invlist_union(prev, current, &u);
13113                             _invlist_intersection(prev, current, &i);
13114                             /* _invlist_subtract will overwrite current
13115                                 without freeing what it already contains */
13116                             element = current;
13117                             _invlist_subtract(u, i, &current);
13118                             av_push(stack, current);
13119                             SvREFCNT_dec_NN(i);
13120                             SvREFCNT_dec_NN(u);
13121                             SvREFCNT_dec_NN(element);
13122                             break;
13123                         }
13124
13125                         default:
13126                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13127                 }
13128                 SvREFCNT_dec_NN(top);
13129                 SvREFCNT_dec(prev);
13130             }
13131         }
13132
13133         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13134     }
13135
13136     if (av_tindex(stack) < 0   /* Was empty */
13137         || ((final = av_pop(stack)) == NULL)
13138         || ! IS_OPERAND(final)
13139         || av_tindex(stack) >= 0)  /* More left on stack */
13140     {
13141         vFAIL("Incomplete expression within '(?[ ])'");
13142     }
13143
13144     /* Here, 'final' is the resultant inversion list from evaluating the
13145      * expression.  Return it if so requested */
13146     if (return_invlist) {
13147         *return_invlist = final;
13148         return END;
13149     }
13150
13151     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13152      * expecting a string of ranges and individual code points */
13153     invlist_iterinit(final);
13154     result_string = newSVpvs("");
13155     while (invlist_iternext(final, &start, &end)) {
13156         if (start == end) {
13157             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13158         }
13159         else {
13160             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13161                                                      start,          end);
13162         }
13163     }
13164
13165     save_parse = RExC_parse;
13166     RExC_parse = SvPV(result_string, len);
13167     save_end = RExC_end;
13168     RExC_end = RExC_parse + len;
13169
13170     /* We turn off folding around the call, as the class we have constructed
13171      * already has all folding taken into consideration, and we don't want
13172      * regclass() to add to that */
13173     RExC_flags &= ~RXf_PMf_FOLD;
13174     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13175      */
13176     node = regclass(pRExC_state, flagp,depth+1,
13177                     FALSE, /* means parse the whole char class */
13178                     FALSE, /* don't allow multi-char folds */
13179                     TRUE, /* silence non-portable warnings.  The above may very
13180                              well have generated non-portable code points, but
13181                              they're valid on this machine */
13182                     NULL);
13183     if (!node)
13184         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13185                     PTR2UV(flagp));
13186     if (save_fold) {
13187         RExC_flags |= RXf_PMf_FOLD;
13188     }
13189     RExC_parse = save_parse + 1;
13190     RExC_end = save_end;
13191     SvREFCNT_dec_NN(final);
13192     SvREFCNT_dec_NN(result_string);
13193
13194     nextchar(pRExC_state);
13195     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13196     return node;
13197 }
13198 #undef IS_OPERAND
13199
13200 STATIC void
13201 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13202 {
13203     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13204      * innocent-looking character class, like /[ks]/i won't have to go out to
13205      * disk to find the possible matches.
13206      *
13207      * This should be called only for a Latin1-range code points, cp, which is
13208      * known to be involved in a fold with other code points above Latin1.  It
13209      * would give false results if /aa has been specified.  Multi-char folds
13210      * are outside the scope of this, and must be handled specially.
13211      *
13212      * XXX It would be better to generate these via regen, in case a new
13213      * version of the Unicode standard adds new mappings, though that is not
13214      * really likely, and may be caught by the default: case of the switch
13215      * below. */
13216
13217     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13218
13219     switch (cp) {
13220         case 'k':
13221         case 'K':
13222           *invlist =
13223              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13224             break;
13225         case 's':
13226         case 'S':
13227           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13228             break;
13229         case MICRO_SIGN:
13230           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13231           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13232             break;
13233         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13234         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13235           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13236             break;
13237         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13238           *invlist = add_cp_to_invlist(*invlist,
13239                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13240             break;
13241         case LATIN_SMALL_LETTER_SHARP_S:
13242           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13243             break;
13244         case 'F': case 'f':
13245         case 'I': case 'i':
13246         case 'L': case 'l':
13247         case 'T': case 't':
13248         case 'A': case 'a':
13249         case 'H': case 'h':
13250         case 'J': case 'j':
13251         case 'N': case 'n':
13252         case 'W': case 'w':
13253         case 'Y': case 'y':
13254             /* These all are targets of multi-character folds from code points
13255              * that require UTF8 to express, so they can't match unless the
13256              * target string is in UTF-8, so no action here is necessary, as
13257              * regexec.c properly handles the general case for UTF-8 matching
13258              * and multi-char folds */
13259             break;
13260         default:
13261             /* Use deprecated warning to increase the chances of this being
13262              * output */
13263             ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13264             break;
13265     }
13266 }
13267
13268 /* The names of properties whose definitions are not known at compile time are
13269  * stored in this SV, after a constant heading.  So if the length has been
13270  * changed since initialization, then there is a run-time definition. */
13271 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13272                                         (SvCUR(listsv) != initial_listsv_len)
13273
13274 STATIC regnode *
13275 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13276                  const bool stop_at_1,  /* Just parse the next thing, don't
13277                                            look for a full character class */
13278                  bool allow_multi_folds,
13279                  const bool silence_non_portable,   /* Don't output warnings
13280                                                        about too large
13281                                                        characters */
13282                  SV** ret_invlist)  /* Return an inversion list, not a node */
13283 {
13284     /* parse a bracketed class specification.  Most of these will produce an
13285      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13286      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13287      * under /i with multi-character folds: it will be rewritten following the
13288      * paradigm of this example, where the <multi-fold>s are characters which
13289      * fold to multiple character sequences:
13290      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13291      * gets effectively rewritten as:
13292      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13293      * reg() gets called (recursively) on the rewritten version, and this
13294      * function will return what it constructs.  (Actually the <multi-fold>s
13295      * aren't physically removed from the [abcdefghi], it's just that they are
13296      * ignored in the recursion by means of a flag:
13297      * <RExC_in_multi_char_class>.)
13298      *
13299      * ANYOF nodes contain a bit map for the first 256 characters, with the
13300      * corresponding bit set if that character is in the list.  For characters
13301      * above 255, a range list or swash is used.  There are extra bits for \w,
13302      * etc. in locale ANYOFs, as what these match is not determinable at
13303      * compile time
13304      *
13305      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13306      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13307      */
13308
13309     dVAR;
13310     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13311     IV range = 0;
13312     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13313     regnode *ret;
13314     STRLEN numlen;
13315     IV namedclass = OOB_NAMEDCLASS;
13316     char *rangebegin = NULL;
13317     bool need_class = 0;
13318     SV *listsv = NULL;
13319     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13320                                       than just initialized.  */
13321     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13322     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13323                                extended beyond the Latin1 range.  These have to
13324                                be kept separate from other code points for much
13325                                of this function because their handling  is
13326                                different under /i, and for most classes under
13327                                /d as well */
13328     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13329                                separate for a while from the non-complemented
13330                                versions because of complications with /d
13331                                matching */
13332     UV element_count = 0;   /* Number of distinct elements in the class.
13333                                Optimizations may be possible if this is tiny */
13334     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13335                                        character; used under /i */
13336     UV n;
13337     char * stop_ptr = RExC_end;    /* where to stop parsing */
13338     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13339                                                    space? */
13340     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13341
13342     /* Unicode properties are stored in a swash; this holds the current one
13343      * being parsed.  If this swash is the only above-latin1 component of the
13344      * character class, an optimization is to pass it directly on to the
13345      * execution engine.  Otherwise, it is set to NULL to indicate that there
13346      * are other things in the class that have to be dealt with at execution
13347      * time */
13348     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13349
13350     /* Set if a component of this character class is user-defined; just passed
13351      * on to the engine */
13352     bool has_user_defined_property = FALSE;
13353
13354     /* inversion list of code points this node matches only when the target
13355      * string is in UTF-8.  (Because is under /d) */
13356     SV* depends_list = NULL;
13357
13358     /* Inversion list of code points this node matches regardless of things
13359      * like locale, folding, utf8ness of the target string */
13360     SV* cp_list = NULL;
13361
13362     /* Like cp_list, but code points on this list need to be checked for things
13363      * that fold to/from them under /i */
13364     SV* cp_foldable_list = NULL;
13365
13366     /* Like cp_list, but code points on this list are valid only when the
13367      * runtime locale is UTF-8 */
13368     SV* only_utf8_locale_list = NULL;
13369
13370 #ifdef EBCDIC
13371     /* In a range, counts how many 0-2 of the ends of it came from literals,
13372      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13373     UV literal_endpoint = 0;
13374 #endif
13375     bool invert = FALSE;    /* Is this class to be complemented */
13376
13377     bool warn_super = ALWAYS_WARN_SUPER;
13378
13379     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13380         case we need to change the emitted regop to an EXACT. */
13381     const char * orig_parse = RExC_parse;
13382     const SSize_t orig_size = RExC_size;
13383     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13384     GET_RE_DEBUG_FLAGS_DECL;
13385
13386     PERL_ARGS_ASSERT_REGCLASS;
13387 #ifndef DEBUGGING
13388     PERL_UNUSED_ARG(depth);
13389 #endif
13390
13391     DEBUG_PARSE("clas");
13392
13393     /* Assume we are going to generate an ANYOF node. */
13394     ret = reganode(pRExC_state, ANYOF, 0);
13395
13396     if (SIZE_ONLY) {
13397         RExC_size += ANYOF_SKIP;
13398         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13399     }
13400     else {
13401         ANYOF_FLAGS(ret) = 0;
13402
13403         RExC_emit += ANYOF_SKIP;
13404         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13405         initial_listsv_len = SvCUR(listsv);
13406         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13407     }
13408
13409     if (skip_white) {
13410         RExC_parse = regpatws(pRExC_state, RExC_parse,
13411                               FALSE /* means don't recognize comments */ );
13412     }
13413
13414     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13415         RExC_parse++;
13416         invert = TRUE;
13417         allow_multi_folds = FALSE;
13418         RExC_naughty++;
13419         if (skip_white) {
13420             RExC_parse = regpatws(pRExC_state, RExC_parse,
13421                                   FALSE /* means don't recognize comments */ );
13422         }
13423     }
13424
13425     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13426     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13427         const char *s = RExC_parse;
13428         const char  c = *s++;
13429
13430         while (isWORDCHAR(*s))
13431             s++;
13432         if (*s && c == *s && s[1] == ']') {
13433             SAVEFREESV(RExC_rx_sv);
13434             ckWARN3reg(s+2,
13435                        "POSIX syntax [%c %c] belongs inside character classes",
13436                        c, c);
13437             (void)ReREFCNT_inc(RExC_rx_sv);
13438         }
13439     }
13440
13441     /* If the caller wants us to just parse a single element, accomplish this
13442      * by faking the loop ending condition */
13443     if (stop_at_1 && RExC_end > RExC_parse) {
13444         stop_ptr = RExC_parse + 1;
13445     }
13446
13447     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13448     if (UCHARAT(RExC_parse) == ']')
13449         goto charclassloop;
13450
13451 parseit:
13452     while (1) {
13453         if  (RExC_parse >= stop_ptr) {
13454             break;
13455         }
13456
13457         if (skip_white) {
13458             RExC_parse = regpatws(pRExC_state, RExC_parse,
13459                                   FALSE /* means don't recognize comments */ );
13460         }
13461
13462         if  (UCHARAT(RExC_parse) == ']') {
13463             break;
13464         }
13465
13466     charclassloop:
13467
13468         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13469         save_value = value;
13470         save_prevvalue = prevvalue;
13471
13472         if (!range) {
13473             rangebegin = RExC_parse;
13474             element_count++;
13475         }
13476         if (UTF) {
13477             value = utf8n_to_uvchr((U8*)RExC_parse,
13478                                    RExC_end - RExC_parse,
13479                                    &numlen, UTF8_ALLOW_DEFAULT);
13480             RExC_parse += numlen;
13481         }
13482         else
13483             value = UCHARAT(RExC_parse++);
13484
13485         if (value == '['
13486             && RExC_parse < RExC_end
13487             && POSIXCC(UCHARAT(RExC_parse)))
13488         {
13489             namedclass = regpposixcc(pRExC_state, value, strict);
13490         }
13491         else if (value == '\\') {
13492             if (UTF) {
13493                 value = utf8n_to_uvchr((U8*)RExC_parse,
13494                                    RExC_end - RExC_parse,
13495                                    &numlen, UTF8_ALLOW_DEFAULT);
13496                 RExC_parse += numlen;
13497             }
13498             else
13499                 value = UCHARAT(RExC_parse++);
13500
13501             /* Some compilers cannot handle switching on 64-bit integer
13502              * values, therefore value cannot be an UV.  Yes, this will
13503              * be a problem later if we want switch on Unicode.
13504              * A similar issue a little bit later when switching on
13505              * namedclass. --jhi */
13506
13507             /* If the \ is escaping white space when white space is being
13508              * skipped, it means that that white space is wanted literally, and
13509              * is already in 'value'.  Otherwise, need to translate the escape
13510              * into what it signifies. */
13511             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13512
13513             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13514             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13515             case 's':   namedclass = ANYOF_SPACE;       break;
13516             case 'S':   namedclass = ANYOF_NSPACE;      break;
13517             case 'd':   namedclass = ANYOF_DIGIT;       break;
13518             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13519             case 'v':   namedclass = ANYOF_VERTWS;      break;
13520             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13521             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13522             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13523             case 'N':  /* Handle \N{NAME} in class */
13524                 {
13525                     /* We only pay attention to the first char of
13526                     multichar strings being returned. I kinda wonder
13527                     if this makes sense as it does change the behaviour
13528                     from earlier versions, OTOH that behaviour was broken
13529                     as well. */
13530                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13531                                       TRUE, /* => charclass */
13532                                       strict))
13533                     {
13534                         if (*flagp & RESTART_UTF8)
13535                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13536                         goto parseit;
13537                     }
13538                 }
13539                 break;
13540             case 'p':
13541             case 'P':
13542                 {
13543                 char *e;
13544
13545                 /* We will handle any undefined properties ourselves */
13546                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13547                                        /* And we actually would prefer to get
13548                                         * the straight inversion list of the
13549                                         * swash, since we will be accessing it
13550                                         * anyway, to save a little time */
13551                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13552
13553                 if (RExC_parse >= RExC_end)
13554                     vFAIL2("Empty \\%c{}", (U8)value);
13555                 if (*RExC_parse == '{') {
13556                     const U8 c = (U8)value;
13557                     e = strchr(RExC_parse++, '}');
13558                     if (!e)
13559                         vFAIL2("Missing right brace on \\%c{}", c);
13560                     while (isSPACE(*RExC_parse))
13561                         RExC_parse++;
13562                     if (e == RExC_parse)
13563                         vFAIL2("Empty \\%c{}", c);
13564                     n = e - RExC_parse;
13565                     while (isSPACE(*(RExC_parse + n - 1)))
13566                         n--;
13567                 }
13568                 else {
13569                     e = RExC_parse;
13570                     n = 1;
13571                 }
13572                 if (!SIZE_ONLY) {
13573                     SV* invlist;
13574                     char* name;
13575
13576                     if (UCHARAT(RExC_parse) == '^') {
13577                          RExC_parse++;
13578                          n--;
13579                          /* toggle.  (The rhs xor gets the single bit that
13580                           * differs between P and p; the other xor inverts just
13581                           * that bit) */
13582                          value ^= 'P' ^ 'p';
13583
13584                          while (isSPACE(*RExC_parse)) {
13585                               RExC_parse++;
13586                               n--;
13587                          }
13588                     }
13589                     /* Try to get the definition of the property into
13590                      * <invlist>.  If /i is in effect, the effective property
13591                      * will have its name be <__NAME_i>.  The design is
13592                      * discussed in commit
13593                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13594                     name = savepv(Perl_form(aTHX_
13595                                           "%s%.*s%s\n",
13596                                           (FOLD) ? "__" : "",
13597                                           (int)n,
13598                                           RExC_parse,
13599                                           (FOLD) ? "_i" : ""
13600                                 ));
13601
13602                     /* Look up the property name, and get its swash and
13603                      * inversion list, if the property is found  */
13604                     if (swash) {
13605                         SvREFCNT_dec_NN(swash);
13606                     }
13607                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13608                                              1, /* binary */
13609                                              0, /* not tr/// */
13610                                              NULL, /* No inversion list */
13611                                              &swash_init_flags
13612                                             );
13613                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13614                         HV* curpkg = (IN_PERL_COMPILETIME)
13615                                       ? PL_curstash
13616                                       : CopSTASH(PL_curcop);
13617                         if (swash) {
13618                             SvREFCNT_dec_NN(swash);
13619                             swash = NULL;
13620                         }
13621
13622                         /* Here didn't find it.  It could be a user-defined
13623                          * property that will be available at run-time.  If we
13624                          * accept only compile-time properties, is an error;
13625                          * otherwise add it to the list for run-time look up */
13626                         if (ret_invlist) {
13627                             RExC_parse = e + 1;
13628                             vFAIL2utf8f(
13629                                 "Property '%"UTF8f"' is unknown",
13630                                 UTF8fARG(UTF, n, name));
13631                         }
13632
13633                         /* If the property name doesn't already have a package
13634                          * name, add the current one to it so that it can be
13635                          * referred to outside it. [perl #121777] */
13636                         if (curpkg && ! instr(name, "::")) {
13637                             char* pkgname = HvNAME(curpkg);
13638                             if (strNE(pkgname, "main")) {
13639                                 char* full_name = Perl_form(aTHX_
13640                                                             "%s::%s",
13641                                                             pkgname,
13642                                                             name);
13643                                 n = strlen(full_name);
13644                                 Safefree(name);
13645                                 name = savepvn(full_name, n);
13646                             }
13647                         }
13648                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13649                                         (value == 'p' ? '+' : '!'),
13650                                         UTF8fARG(UTF, n, name));
13651                         has_user_defined_property = TRUE;
13652
13653                         /* We don't know yet, so have to assume that the
13654                          * property could match something in the Latin1 range,
13655                          * hence something that isn't utf8.  Note that this
13656                          * would cause things in <depends_list> to match
13657                          * inappropriately, except that any \p{}, including
13658                          * this one forces Unicode semantics, which means there
13659                          * is no <depends_list> */
13660                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13661                     }
13662                     else {
13663
13664                         /* Here, did get the swash and its inversion list.  If
13665                          * the swash is from a user-defined property, then this
13666                          * whole character class should be regarded as such */
13667                         if (swash_init_flags
13668                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13669                         {
13670                             has_user_defined_property = TRUE;
13671                         }
13672                         else if
13673                             /* We warn on matching an above-Unicode code point
13674                              * if the match would return true, except don't
13675                              * warn for \p{All}, which has exactly one element
13676                              * = 0 */
13677                             (_invlist_contains_cp(invlist, 0x110000)
13678                                 && (! (_invlist_len(invlist) == 1
13679                                        && *invlist_array(invlist) == 0)))
13680                         {
13681                             warn_super = TRUE;
13682                         }
13683
13684
13685                         /* Invert if asking for the complement */
13686                         if (value == 'P') {
13687                             _invlist_union_complement_2nd(properties,
13688                                                           invlist,
13689                                                           &properties);
13690
13691                             /* The swash can't be used as-is, because we've
13692                              * inverted things; delay removing it to here after
13693                              * have copied its invlist above */
13694                             SvREFCNT_dec_NN(swash);
13695                             swash = NULL;
13696                         }
13697                         else {
13698                             _invlist_union(properties, invlist, &properties);
13699                         }
13700                     }
13701                     Safefree(name);
13702                 }
13703                 RExC_parse = e + 1;
13704                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13705                                                 named */
13706
13707                 /* \p means they want Unicode semantics */
13708                 RExC_uni_semantics = 1;
13709                 }
13710                 break;
13711             case 'n':   value = '\n';                   break;
13712             case 'r':   value = '\r';                   break;
13713             case 't':   value = '\t';                   break;
13714             case 'f':   value = '\f';                   break;
13715             case 'b':   value = '\b';                   break;
13716             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13717             case 'a':   value = '\a';                   break;
13718             case 'o':
13719                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13720                 {
13721                     const char* error_msg;
13722                     bool valid = grok_bslash_o(&RExC_parse,
13723                                                &value,
13724                                                &error_msg,
13725                                                SIZE_ONLY,   /* warnings in pass
13726                                                                1 only */
13727                                                strict,
13728                                                silence_non_portable,
13729                                                UTF);
13730                     if (! valid) {
13731                         vFAIL(error_msg);
13732                     }
13733                 }
13734                 if (PL_encoding && value < 0x100) {
13735                     goto recode_encoding;
13736                 }
13737                 break;
13738             case 'x':
13739                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13740                 {
13741                     const char* error_msg;
13742                     bool valid = grok_bslash_x(&RExC_parse,
13743                                                &value,
13744                                                &error_msg,
13745                                                TRUE, /* Output warnings */
13746                                                strict,
13747                                                silence_non_portable,
13748                                                UTF);
13749                     if (! valid) {
13750                         vFAIL(error_msg);
13751                     }
13752                 }
13753                 if (PL_encoding && value < 0x100)
13754                     goto recode_encoding;
13755                 break;
13756             case 'c':
13757                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13758                 break;
13759             case '0': case '1': case '2': case '3': case '4':
13760             case '5': case '6': case '7':
13761                 {
13762                     /* Take 1-3 octal digits */
13763                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13764                     numlen = (strict) ? 4 : 3;
13765                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13766                     RExC_parse += numlen;
13767                     if (numlen != 3) {
13768                         if (strict) {
13769                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13770                             vFAIL("Need exactly 3 octal digits");
13771                         }
13772                         else if (! SIZE_ONLY /* like \08, \178 */
13773                                  && numlen < 3
13774                                  && RExC_parse < RExC_end
13775                                  && isDIGIT(*RExC_parse)
13776                                  && ckWARN(WARN_REGEXP))
13777                         {
13778                             SAVEFREESV(RExC_rx_sv);
13779                             reg_warn_non_literal_string(
13780                                  RExC_parse + 1,
13781                                  form_short_octal_warning(RExC_parse, numlen));
13782                             (void)ReREFCNT_inc(RExC_rx_sv);
13783                         }
13784                     }
13785                     if (PL_encoding && value < 0x100)
13786                         goto recode_encoding;
13787                     break;
13788                 }
13789             recode_encoding:
13790                 if (! RExC_override_recoding) {
13791                     SV* enc = PL_encoding;
13792                     value = reg_recode((const char)(U8)value, &enc);
13793                     if (!enc) {
13794                         if (strict) {
13795                             vFAIL("Invalid escape in the specified encoding");
13796                         }
13797                         else if (SIZE_ONLY) {
13798                             ckWARNreg(RExC_parse,
13799                                   "Invalid escape in the specified encoding");
13800                         }
13801                     }
13802                     break;
13803                 }
13804             default:
13805                 /* Allow \_ to not give an error */
13806                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13807                     if (strict) {
13808                         vFAIL2("Unrecognized escape \\%c in character class",
13809                                (int)value);
13810                     }
13811                     else {
13812                         SAVEFREESV(RExC_rx_sv);
13813                         ckWARN2reg(RExC_parse,
13814                             "Unrecognized escape \\%c in character class passed through",
13815                             (int)value);
13816                         (void)ReREFCNT_inc(RExC_rx_sv);
13817                     }
13818                 }
13819                 break;
13820             }   /* End of switch on char following backslash */
13821         } /* end of handling backslash escape sequences */
13822 #ifdef EBCDIC
13823         else
13824             literal_endpoint++;
13825 #endif
13826
13827         /* Here, we have the current token in 'value' */
13828
13829         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13830             U8 classnum;
13831
13832             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13833              * literal, as is the character that began the false range, i.e.
13834              * the 'a' in the examples */
13835             if (range) {
13836                 if (!SIZE_ONLY) {
13837                     const int w = (RExC_parse >= rangebegin)
13838                                   ? RExC_parse - rangebegin
13839                                   : 0;
13840                     if (strict) {
13841                         vFAIL2utf8f(
13842                             "False [] range \"%"UTF8f"\"",
13843                             UTF8fARG(UTF, w, rangebegin));
13844                     }
13845                     else {
13846                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13847                         ckWARN2reg(RExC_parse,
13848                             "False [] range \"%"UTF8f"\"",
13849                             UTF8fARG(UTF, w, rangebegin));
13850                         (void)ReREFCNT_inc(RExC_rx_sv);
13851                         cp_list = add_cp_to_invlist(cp_list, '-');
13852                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13853                                                              prevvalue);
13854                     }
13855                 }
13856
13857                 range = 0; /* this was not a true range */
13858                 element_count += 2; /* So counts for three values */
13859             }
13860
13861             classnum = namedclass_to_classnum(namedclass);
13862
13863             if (LOC && namedclass < ANYOF_POSIXL_MAX
13864 #ifndef HAS_ISASCII
13865                 && classnum != _CC_ASCII
13866 #endif
13867             ) {
13868                 /* What the Posix classes (like \w, [:space:]) match in locale
13869                  * isn't knowable under locale until actual match time.  Room
13870                  * must be reserved (one time per outer bracketed class) to
13871                  * store such classes.  The space will contain a bit for each
13872                  * named class that is to be matched against.  This isn't
13873                  * needed for \p{} and pseudo-classes, as they are not affected
13874                  * by locale, and hence are dealt with separately */
13875                 if (! need_class) {
13876                     need_class = 1;
13877                     if (SIZE_ONLY) {
13878                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13879                     }
13880                     else {
13881                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13882                     }
13883                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13884                     ANYOF_POSIXL_ZERO(ret);
13885                 }
13886
13887                 /* Coverity thinks it is possible for this to be negative; both
13888                  * jhi and khw think it's not, but be safer */
13889                 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13890                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13891
13892                 /* See if it already matches the complement of this POSIX
13893                  * class */
13894                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13895                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13896                                                             ? -1
13897                                                             : 1)))
13898                 {
13899                     posixl_matches_all = TRUE;
13900                     break;  /* No need to continue.  Since it matches both
13901                                e.g., \w and \W, it matches everything, and the
13902                                bracketed class can be optimized into qr/./s */
13903                 }
13904
13905                 /* Add this class to those that should be checked at runtime */
13906                 ANYOF_POSIXL_SET(ret, namedclass);
13907
13908                 /* The above-Latin1 characters are not subject to locale rules.
13909                  * Just add them, in the second pass, to the
13910                  * unconditionally-matched list */
13911                 if (! SIZE_ONLY) {
13912                     SV* scratch_list = NULL;
13913
13914                     /* Get the list of the above-Latin1 code points this
13915                      * matches */
13916                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13917                                           PL_XPosix_ptrs[classnum],
13918
13919                                           /* Odd numbers are complements, like
13920                                            * NDIGIT, NASCII, ... */
13921                                           namedclass % 2 != 0,
13922                                           &scratch_list);
13923                     /* Checking if 'cp_list' is NULL first saves an extra
13924                      * clone.  Its reference count will be decremented at the
13925                      * next union, etc, or if this is the only instance, at the
13926                      * end of the routine */
13927                     if (! cp_list) {
13928                         cp_list = scratch_list;
13929                     }
13930                     else {
13931                         _invlist_union(cp_list, scratch_list, &cp_list);
13932                         SvREFCNT_dec_NN(scratch_list);
13933                     }
13934                     continue;   /* Go get next character */
13935                 }
13936             }
13937             else if (! SIZE_ONLY) {
13938
13939                 /* Here, not in pass1 (in that pass we skip calculating the
13940                  * contents of this class), and is /l, or is a POSIX class for
13941                  * which /l doesn't matter (or is a Unicode property, which is
13942                  * skipped here). */
13943                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13944                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13945
13946                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13947                          * nor /l make a difference in what these match,
13948                          * therefore we just add what they match to cp_list. */
13949                         if (classnum != _CC_VERTSPACE) {
13950                             assert(   namedclass == ANYOF_HORIZWS
13951                                    || namedclass == ANYOF_NHORIZWS);
13952
13953                             /* It turns out that \h is just a synonym for
13954                              * XPosixBlank */
13955                             classnum = _CC_BLANK;
13956                         }
13957
13958                         _invlist_union_maybe_complement_2nd(
13959                                 cp_list,
13960                                 PL_XPosix_ptrs[classnum],
13961                                 namedclass % 2 != 0,    /* Complement if odd
13962                                                           (NHORIZWS, NVERTWS)
13963                                                         */
13964                                 &cp_list);
13965                     }
13966                 }
13967                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13968                            complement and use nposixes */
13969                     SV** posixes_ptr = namedclass % 2 == 0
13970                                        ? &posixes
13971                                        : &nposixes;
13972                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13973                     _invlist_union_maybe_complement_2nd(
13974                                                      *posixes_ptr,
13975                                                      *source_ptr,
13976                                                      namedclass % 2 != 0,
13977                                                      posixes_ptr);
13978                 }
13979                 continue;   /* Go get next character */
13980             }
13981         } /* end of namedclass \blah */
13982
13983         /* Here, we have a single value.  If 'range' is set, it is the ending
13984          * of a range--check its validity.  Later, we will handle each
13985          * individual code point in the range.  If 'range' isn't set, this
13986          * could be the beginning of a range, so check for that by looking
13987          * ahead to see if the next real character to be processed is the range
13988          * indicator--the minus sign */
13989
13990         if (skip_white) {
13991             RExC_parse = regpatws(pRExC_state, RExC_parse,
13992                                 FALSE /* means don't recognize comments */ );
13993         }
13994
13995         if (range) {
13996             if (prevvalue > value) /* b-a */ {
13997                 const int w = RExC_parse - rangebegin;
13998                 vFAIL2utf8f(
13999                     "Invalid [] range \"%"UTF8f"\"",
14000                     UTF8fARG(UTF, w, rangebegin));
14001                 range = 0; /* not a valid range */
14002             }
14003         }
14004         else {
14005             prevvalue = value; /* save the beginning of the potential range */
14006             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14007                 && *RExC_parse == '-')
14008             {
14009                 char* next_char_ptr = RExC_parse + 1;
14010                 if (skip_white) {   /* Get the next real char after the '-' */
14011                     next_char_ptr = regpatws(pRExC_state,
14012                                              RExC_parse + 1,
14013                                              FALSE); /* means don't recognize
14014                                                         comments */
14015                 }
14016
14017                 /* If the '-' is at the end of the class (just before the ']',
14018                  * it is a literal minus; otherwise it is a range */
14019                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14020                     RExC_parse = next_char_ptr;
14021
14022                     /* a bad range like \w-, [:word:]- ? */
14023                     if (namedclass > OOB_NAMEDCLASS) {
14024                         if (strict || ckWARN(WARN_REGEXP)) {
14025                             const int w =
14026                                 RExC_parse >= rangebegin ?
14027                                 RExC_parse - rangebegin : 0;
14028                             if (strict) {
14029                                 vFAIL4("False [] range \"%*.*s\"",
14030                                     w, w, rangebegin);
14031                             }
14032                             else {
14033                                 vWARN4(RExC_parse,
14034                                     "False [] range \"%*.*s\"",
14035                                     w, w, rangebegin);
14036                             }
14037                         }
14038                         if (!SIZE_ONLY) {
14039                             cp_list = add_cp_to_invlist(cp_list, '-');
14040                         }
14041                         element_count++;
14042                     } else
14043                         range = 1;      /* yeah, it's a range! */
14044                     continue;   /* but do it the next time */
14045                 }
14046             }
14047         }
14048
14049         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14050          * if not */
14051
14052         /* non-Latin1 code point implies unicode semantics.  Must be set in
14053          * pass1 so is there for the whole of pass 2 */
14054         if (value > 255) {
14055             RExC_uni_semantics = 1;
14056         }
14057
14058         /* Ready to process either the single value, or the completed range.
14059          * For single-valued non-inverted ranges, we consider the possibility
14060          * of multi-char folds.  (We made a conscious decision to not do this
14061          * for the other cases because it can often lead to non-intuitive
14062          * results.  For example, you have the peculiar case that:
14063          *  "s s" =~ /^[^\xDF]+$/i => Y
14064          *  "ss"  =~ /^[^\xDF]+$/i => N
14065          *
14066          * See [perl #89750] */
14067         if (FOLD && allow_multi_folds && value == prevvalue) {
14068             if (value == LATIN_SMALL_LETTER_SHARP_S
14069                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14070                                                         value)))
14071             {
14072                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14073
14074                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14075                 STRLEN foldlen;
14076
14077                 UV folded = _to_uni_fold_flags(
14078                                 value,
14079                                 foldbuf,
14080                                 &foldlen,
14081                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14082                                                    ? FOLD_FLAGS_NOMIX_ASCII
14083                                                    : 0)
14084                                 );
14085
14086                 /* Here, <folded> should be the first character of the
14087                  * multi-char fold of <value>, with <foldbuf> containing the
14088                  * whole thing.  But, if this fold is not allowed (because of
14089                  * the flags), <fold> will be the same as <value>, and should
14090                  * be processed like any other character, so skip the special
14091                  * handling */
14092                 if (folded != value) {
14093
14094                     /* Skip if we are recursed, currently parsing the class
14095                      * again.  Otherwise add this character to the list of
14096                      * multi-char folds. */
14097                     if (! RExC_in_multi_char_class) {
14098                         AV** this_array_ptr;
14099                         AV* this_array;
14100                         STRLEN cp_count = utf8_length(foldbuf,
14101                                                       foldbuf + foldlen);
14102                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14103
14104                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14105
14106
14107                         if (! multi_char_matches) {
14108                             multi_char_matches = newAV();
14109                         }
14110
14111                         /* <multi_char_matches> is actually an array of arrays.
14112                          * There will be one or two top-level elements: [2],
14113                          * and/or [3].  The [2] element is an array, each
14114                          * element thereof is a character which folds to TWO
14115                          * characters; [3] is for folds to THREE characters.
14116                          * (Unicode guarantees a maximum of 3 characters in any
14117                          * fold.)  When we rewrite the character class below,
14118                          * we will do so such that the longest folds are
14119                          * written first, so that it prefers the longest
14120                          * matching strings first.  This is done even if it
14121                          * turns out that any quantifier is non-greedy, out of
14122                          * programmer laziness.  Tom Christiansen has agreed
14123                          * that this is ok.  This makes the test for the
14124                          * ligature 'ffi' come before the test for 'ff' */
14125                         if (av_exists(multi_char_matches, cp_count)) {
14126                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14127                                                              cp_count, FALSE);
14128                             this_array = *this_array_ptr;
14129                         }
14130                         else {
14131                             this_array = newAV();
14132                             av_store(multi_char_matches, cp_count,
14133                                      (SV*) this_array);
14134                         }
14135                         av_push(this_array, multi_fold);
14136                     }
14137
14138                     /* This element should not be processed further in this
14139                      * class */
14140                     element_count--;
14141                     value = save_value;
14142                     prevvalue = save_prevvalue;
14143                     continue;
14144                 }
14145             }
14146         }
14147
14148         /* Deal with this element of the class */
14149         if (! SIZE_ONLY) {
14150 #ifndef EBCDIC
14151             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14152                                                      prevvalue, value);
14153 #else
14154             SV* this_range = _new_invlist(1);
14155             _append_range_to_invlist(this_range, prevvalue, value);
14156
14157             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14158              * If this range was specified using something like 'i-j', we want
14159              * to include only the 'i' and the 'j', and not anything in
14160              * between, so exclude non-ASCII, non-alphabetics from it.
14161              * However, if the range was specified with something like
14162              * [\x89-\x91] or [\x89-j], all code points within it should be
14163              * included.  literal_endpoint==2 means both ends of the range used
14164              * a literal character, not \x{foo} */
14165             if (literal_endpoint == 2
14166                 && ((prevvalue >= 'a' && value <= 'z')
14167                     || (prevvalue >= 'A' && value <= 'Z')))
14168             {
14169                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14170                                       &this_range);
14171
14172                 /* Since this above only contains ascii, the intersection of it
14173                  * with anything will still yield only ascii */
14174                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14175                                       &this_range);
14176             }
14177             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14178             literal_endpoint = 0;
14179 #endif
14180         }
14181
14182         range = 0; /* this range (if it was one) is done now */
14183     } /* End of loop through all the text within the brackets */
14184
14185     /* If anything in the class expands to more than one character, we have to
14186      * deal with them by building up a substitute parse string, and recursively
14187      * calling reg() on it, instead of proceeding */
14188     if (multi_char_matches) {
14189         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14190         I32 cp_count;
14191         STRLEN len;
14192         char *save_end = RExC_end;
14193         char *save_parse = RExC_parse;
14194         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14195                                        a "|" */
14196         I32 reg_flags;
14197
14198         assert(! invert);
14199 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14200            because too confusing */
14201         if (invert) {
14202             sv_catpv(substitute_parse, "(?:");
14203         }
14204 #endif
14205
14206         /* Look at the longest folds first */
14207         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14208
14209             if (av_exists(multi_char_matches, cp_count)) {
14210                 AV** this_array_ptr;
14211                 SV* this_sequence;
14212
14213                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14214                                                  cp_count, FALSE);
14215                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14216                                                                 &PL_sv_undef)
14217                 {
14218                     if (! first_time) {
14219                         sv_catpv(substitute_parse, "|");
14220                     }
14221                     first_time = FALSE;
14222
14223                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14224                 }
14225             }
14226         }
14227
14228         /* If the character class contains anything else besides these
14229          * multi-character folds, have to include it in recursive parsing */
14230         if (element_count) {
14231             sv_catpv(substitute_parse, "|[");
14232             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14233             sv_catpv(substitute_parse, "]");
14234         }
14235
14236         sv_catpv(substitute_parse, ")");
14237 #if 0
14238         if (invert) {
14239             /* This is a way to get the parse to skip forward a whole named
14240              * sequence instead of matching the 2nd character when it fails the
14241              * first */
14242             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14243         }
14244 #endif
14245
14246         RExC_parse = SvPV(substitute_parse, len);
14247         RExC_end = RExC_parse + len;
14248         RExC_in_multi_char_class = 1;
14249         RExC_emit = (regnode *)orig_emit;
14250
14251         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14252
14253         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14254
14255         RExC_parse = save_parse;
14256         RExC_end = save_end;
14257         RExC_in_multi_char_class = 0;
14258         SvREFCNT_dec_NN(multi_char_matches);
14259         return ret;
14260     }
14261
14262     /* Here, we've gone through the entire class and dealt with multi-char
14263      * folds.  We are now in a position that we can do some checks to see if we
14264      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14265      * Currently we only do two checks:
14266      * 1) is in the unlikely event that the user has specified both, eg. \w and
14267      *    \W under /l, then the class matches everything.  (This optimization
14268      *    is done only to make the optimizer code run later work.)
14269      * 2) if the character class contains only a single element (including a
14270      *    single range), we see if there is an equivalent node for it.
14271      * Other checks are possible */
14272     if (! ret_invlist   /* Can't optimize if returning the constructed
14273                            inversion list */
14274         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14275     {
14276         U8 op = END;
14277         U8 arg = 0;
14278
14279         if (UNLIKELY(posixl_matches_all)) {
14280             op = SANY;
14281         }
14282         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14283                                                    \w or [:digit:] or \p{foo}
14284                                                  */
14285
14286             /* All named classes are mapped into POSIXish nodes, with its FLAG
14287              * argument giving which class it is */
14288             switch ((I32)namedclass) {
14289                 case ANYOF_UNIPROP:
14290                     break;
14291
14292                 /* These don't depend on the charset modifiers.  They always
14293                  * match under /u rules */
14294                 case ANYOF_NHORIZWS:
14295                 case ANYOF_HORIZWS:
14296                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14297                     /* FALLTHROUGH */
14298
14299                 case ANYOF_NVERTWS:
14300                 case ANYOF_VERTWS:
14301                     op = POSIXU;
14302                     goto join_posix;
14303
14304                 /* The actual POSIXish node for all the rest depends on the
14305                  * charset modifier.  The ones in the first set depend only on
14306                  * ASCII or, if available on this platform, locale */
14307                 case ANYOF_ASCII:
14308                 case ANYOF_NASCII:
14309 #ifdef HAS_ISASCII
14310                     op = (LOC) ? POSIXL : POSIXA;
14311 #else
14312                     op = POSIXA;
14313 #endif
14314                     goto join_posix;
14315
14316                 case ANYOF_NCASED:
14317                 case ANYOF_LOWER:
14318                 case ANYOF_NLOWER:
14319                 case ANYOF_UPPER:
14320                 case ANYOF_NUPPER:
14321                     /* under /a could be alpha */
14322                     if (FOLD) {
14323                         if (ASCII_RESTRICTED) {
14324                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14325                         }
14326                         else if (! LOC) {
14327                             break;
14328                         }
14329                     }
14330                     /* FALLTHROUGH */
14331
14332                 /* The rest have more possibilities depending on the charset.
14333                  * We take advantage of the enum ordering of the charset
14334                  * modifiers to get the exact node type, */
14335                 default:
14336                     op = POSIXD + get_regex_charset(RExC_flags);
14337                     if (op > POSIXA) { /* /aa is same as /a */
14338                         op = POSIXA;
14339                     }
14340
14341                 join_posix:
14342                     /* The odd numbered ones are the complements of the
14343                      * next-lower even number one */
14344                     if (namedclass % 2 == 1) {
14345                         invert = ! invert;
14346                         namedclass--;
14347                     }
14348                     arg = namedclass_to_classnum(namedclass);
14349                     break;
14350             }
14351         }
14352         else if (value == prevvalue) {
14353
14354             /* Here, the class consists of just a single code point */
14355
14356             if (invert) {
14357                 if (! LOC && value == '\n') {
14358                     op = REG_ANY; /* Optimize [^\n] */
14359                     *flagp |= HASWIDTH|SIMPLE;
14360                     RExC_naughty++;
14361                 }
14362             }
14363             else if (value < 256 || UTF) {
14364
14365                 /* Optimize a single value into an EXACTish node, but not if it
14366                  * would require converting the pattern to UTF-8. */
14367                 op = compute_EXACTish(pRExC_state);
14368             }
14369         } /* Otherwise is a range */
14370         else if (! LOC) {   /* locale could vary these */
14371             if (prevvalue == '0') {
14372                 if (value == '9') {
14373                     arg = _CC_DIGIT;
14374                     op = POSIXA;
14375                 }
14376             }
14377             else if (prevvalue == 'A') {
14378                 if (value == 'Z'
14379 #ifdef EBCDIC
14380                     && literal_endpoint == 2
14381 #endif
14382                 ) {
14383                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14384                     op = POSIXA;
14385                 }
14386             }
14387             else if (prevvalue == 'a') {
14388                 if (value == 'z'
14389 #ifdef EBCDIC
14390                     && literal_endpoint == 2
14391 #endif
14392                 ) {
14393                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14394                     op = POSIXA;
14395                 }
14396             }
14397         }
14398
14399         /* Here, we have changed <op> away from its initial value iff we found
14400          * an optimization */
14401         if (op != END) {
14402
14403             /* Throw away this ANYOF regnode, and emit the calculated one,
14404              * which should correspond to the beginning, not current, state of
14405              * the parse */
14406             const char * cur_parse = RExC_parse;
14407             RExC_parse = (char *)orig_parse;
14408             if ( SIZE_ONLY) {
14409                 if (! LOC) {
14410
14411                     /* To get locale nodes to not use the full ANYOF size would
14412                      * require moving the code above that writes the portions
14413                      * of it that aren't in other nodes to after this point.
14414                      * e.g.  ANYOF_POSIXL_SET */
14415                     RExC_size = orig_size;
14416                 }
14417             }
14418             else {
14419                 RExC_emit = (regnode *)orig_emit;
14420                 if (PL_regkind[op] == POSIXD) {
14421                     if (op == POSIXL) {
14422                         RExC_contains_locale = 1;
14423                     }
14424                     if (invert) {
14425                         op += NPOSIXD - POSIXD;
14426                     }
14427                 }
14428             }
14429
14430             ret = reg_node(pRExC_state, op);
14431
14432             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14433                 if (! SIZE_ONLY) {
14434                     FLAGS(ret) = arg;
14435                 }
14436                 *flagp |= HASWIDTH|SIMPLE;
14437             }
14438             else if (PL_regkind[op] == EXACT) {
14439                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14440                                            TRUE /* downgradable to EXACT */
14441                                            );
14442             }
14443
14444             RExC_parse = (char *) cur_parse;
14445
14446             SvREFCNT_dec(posixes);
14447             SvREFCNT_dec(nposixes);
14448             SvREFCNT_dec(cp_list);
14449             SvREFCNT_dec(cp_foldable_list);
14450             return ret;
14451         }
14452     }
14453
14454     if (SIZE_ONLY)
14455         return ret;
14456     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14457
14458     /* If folding, we calculate all characters that could fold to or from the
14459      * ones already on the list */
14460     if (cp_foldable_list) {
14461         if (FOLD) {
14462             UV start, end;      /* End points of code point ranges */
14463
14464             SV* fold_intersection = NULL;
14465             SV** use_list;
14466
14467             /* Our calculated list will be for Unicode rules.  For locale
14468              * matching, we have to keep a separate list that is consulted at
14469              * runtime only when the locale indicates Unicode rules.  For
14470              * non-locale, we just use to the general list */
14471             if (LOC) {
14472                 use_list = &only_utf8_locale_list;
14473             }
14474             else {
14475                 use_list = &cp_list;
14476             }
14477
14478             /* Only the characters in this class that participate in folds need
14479              * be checked.  Get the intersection of this class and all the
14480              * possible characters that are foldable.  This can quickly narrow
14481              * down a large class */
14482             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14483                                   &fold_intersection);
14484
14485             /* The folds for all the Latin1 characters are hard-coded into this
14486              * program, but we have to go out to disk to get the others. */
14487             if (invlist_highest(cp_foldable_list) >= 256) {
14488
14489                 /* This is a hash that for a particular fold gives all
14490                  * characters that are involved in it */
14491                 if (! PL_utf8_foldclosures) {
14492                     _load_PL_utf8_foldclosures();
14493                 }
14494             }
14495
14496             /* Now look at the foldable characters in this class individually */
14497             invlist_iterinit(fold_intersection);
14498             while (invlist_iternext(fold_intersection, &start, &end)) {
14499                 UV j;
14500
14501                 /* Look at every character in the range */
14502                 for (j = start; j <= end; j++) {
14503                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14504                     STRLEN foldlen;
14505                     SV** listp;
14506
14507                     if (j < 256) {
14508
14509                         if (IS_IN_SOME_FOLD_L1(j)) {
14510
14511                             /* ASCII is always matched; non-ASCII is matched
14512                              * only under Unicode rules (which could happen
14513                              * under /l if the locale is a UTF-8 one */
14514                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14515                                 *use_list = add_cp_to_invlist(*use_list,
14516                                                             PL_fold_latin1[j]);
14517                             }
14518                             else {
14519                                 depends_list =
14520                                  add_cp_to_invlist(depends_list,
14521                                                    PL_fold_latin1[j]);
14522                             }
14523                         }
14524
14525                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14526                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14527                         {
14528                             add_above_Latin1_folds(pRExC_state,
14529                                                    (U8) j,
14530                                                    use_list);
14531                         }
14532                         continue;
14533                     }
14534
14535                     /* Here is an above Latin1 character.  We don't have the
14536                      * rules hard-coded for it.  First, get its fold.  This is
14537                      * the simple fold, as the multi-character folds have been
14538                      * handled earlier and separated out */
14539                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14540                                                         (ASCII_FOLD_RESTRICTED)
14541                                                         ? FOLD_FLAGS_NOMIX_ASCII
14542                                                         : 0);
14543
14544                     /* Single character fold of above Latin1.  Add everything in
14545                     * its fold closure to the list that this node should match.
14546                     * The fold closures data structure is a hash with the keys
14547                     * being the UTF-8 of every character that is folded to, like
14548                     * 'k', and the values each an array of all code points that
14549                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14550                     * Multi-character folds are not included */
14551                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14552                                         (char *) foldbuf, foldlen, FALSE)))
14553                     {
14554                         AV* list = (AV*) *listp;
14555                         IV k;
14556                         for (k = 0; k <= av_tindex(list); k++) {
14557                             SV** c_p = av_fetch(list, k, FALSE);
14558                             UV c;
14559                             assert(c_p);
14560
14561                             c = SvUV(*c_p);
14562
14563                             /* /aa doesn't allow folds between ASCII and non- */
14564                             if ((ASCII_FOLD_RESTRICTED
14565                                 && (isASCII(c) != isASCII(j))))
14566                             {
14567                                 continue;
14568                             }
14569
14570                             /* Folds under /l which cross the 255/256 boundary
14571                              * are added to a separate list.  (These are valid
14572                              * only when the locale is UTF-8.) */
14573                             if (c < 256 && LOC) {
14574                                 *use_list = add_cp_to_invlist(*use_list, c);
14575                                 continue;
14576                             }
14577
14578                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14579                             {
14580                                 cp_list = add_cp_to_invlist(cp_list, c);
14581                             }
14582                             else {
14583                                 /* Similarly folds involving non-ascii Latin1
14584                                 * characters under /d are added to their list */
14585                                 depends_list = add_cp_to_invlist(depends_list,
14586                                                                  c);
14587                             }
14588                         }
14589                     }
14590                 }
14591             }
14592             SvREFCNT_dec_NN(fold_intersection);
14593         }
14594
14595         /* Now that we have finished adding all the folds, there is no reason
14596          * to keep the foldable list separate */
14597         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14598         SvREFCNT_dec_NN(cp_foldable_list);
14599     }
14600
14601     /* And combine the result (if any) with any inversion list from posix
14602      * classes.  The lists are kept separate up to now because we don't want to
14603      * fold the classes (folding of those is automatically handled by the swash
14604      * fetching code) */
14605     if (posixes || nposixes) {
14606         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14607             /* Under /a and /aa, nothing above ASCII matches these */
14608             _invlist_intersection(posixes,
14609                                   PL_XPosix_ptrs[_CC_ASCII],
14610                                   &posixes);
14611         }
14612         if (nposixes) {
14613             if (DEPENDS_SEMANTICS) {
14614                 /* Under /d, everything in the upper half of the Latin1 range
14615                  * matches these complements */
14616                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14617             }
14618             else if (AT_LEAST_ASCII_RESTRICTED) {
14619                 /* Under /a and /aa, everything above ASCII matches these
14620                  * complements */
14621                 _invlist_union_complement_2nd(nposixes,
14622                                               PL_XPosix_ptrs[_CC_ASCII],
14623                                               &nposixes);
14624             }
14625             if (posixes) {
14626                 _invlist_union(posixes, nposixes, &posixes);
14627                 SvREFCNT_dec_NN(nposixes);
14628             }
14629             else {
14630                 posixes = nposixes;
14631             }
14632         }
14633         if (! DEPENDS_SEMANTICS) {
14634             if (cp_list) {
14635                 _invlist_union(cp_list, posixes, &cp_list);
14636                 SvREFCNT_dec_NN(posixes);
14637             }
14638             else {
14639                 cp_list = posixes;
14640             }
14641         }
14642         else {
14643             /* Under /d, we put into a separate list the Latin1 things that
14644              * match only when the target string is utf8 */
14645             SV* nonascii_but_latin1_properties = NULL;
14646             _invlist_intersection(posixes, PL_UpperLatin1,
14647                                   &nonascii_but_latin1_properties);
14648             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14649                               &posixes);
14650             if (cp_list) {
14651                 _invlist_union(cp_list, posixes, &cp_list);
14652                 SvREFCNT_dec_NN(posixes);
14653             }
14654             else {
14655                 cp_list = posixes;
14656             }
14657
14658             if (depends_list) {
14659                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14660                                &depends_list);
14661                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14662             }
14663             else {
14664                 depends_list = nonascii_but_latin1_properties;
14665             }
14666         }
14667     }
14668
14669     /* And combine the result (if any) with any inversion list from properties.
14670      * The lists are kept separate up to now so that we can distinguish the two
14671      * in regards to matching above-Unicode.  A run-time warning is generated
14672      * if a Unicode property is matched against a non-Unicode code point. But,
14673      * we allow user-defined properties to match anything, without any warning,
14674      * and we also suppress the warning if there is a portion of the character
14675      * class that isn't a Unicode property, and which matches above Unicode, \W
14676      * or [\x{110000}] for example.
14677      * (Note that in this case, unlike the Posix one above, there is no
14678      * <depends_list>, because having a Unicode property forces Unicode
14679      * semantics */
14680     if (properties) {
14681         if (cp_list) {
14682
14683             /* If it matters to the final outcome, see if a non-property
14684              * component of the class matches above Unicode.  If so, the
14685              * warning gets suppressed.  This is true even if just a single
14686              * such code point is specified, as though not strictly correct if
14687              * another such code point is matched against, the fact that they
14688              * are using above-Unicode code points indicates they should know
14689              * the issues involved */
14690             if (warn_super) {
14691                 warn_super = ! (invert
14692                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14693             }
14694
14695             _invlist_union(properties, cp_list, &cp_list);
14696             SvREFCNT_dec_NN(properties);
14697         }
14698         else {
14699             cp_list = properties;
14700         }
14701
14702         if (warn_super) {
14703             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14704         }
14705     }
14706
14707     /* Here, we have calculated what code points should be in the character
14708      * class.
14709      *
14710      * Now we can see about various optimizations.  Fold calculation (which we
14711      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14712      * would invert to include K, which under /i would match k, which it
14713      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14714      * folded until runtime */
14715
14716     /* If we didn't do folding, it's because some information isn't available
14717      * until runtime; set the run-time fold flag for these.  (We don't have to
14718      * worry about properties folding, as that is taken care of by the swash
14719      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14720      * locales, or the class matches at least one 0-255 range code point */
14721     if (LOC && FOLD) {
14722         if (only_utf8_locale_list) {
14723             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14724         }
14725         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14726                                the list */
14727             UV start, end;
14728             invlist_iterinit(cp_list);
14729             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14730                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14731             }
14732             invlist_iterfinish(cp_list);
14733         }
14734     }
14735
14736     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14737      * at compile time.  Besides not inverting folded locale now, we can't
14738      * invert if there are things such as \w, which aren't known until runtime
14739      * */
14740     if (cp_list
14741         && invert
14742         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14743         && ! depends_list
14744         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14745     {
14746         _invlist_invert(cp_list);
14747
14748         /* Any swash can't be used as-is, because we've inverted things */
14749         if (swash) {
14750             SvREFCNT_dec_NN(swash);
14751             swash = NULL;
14752         }
14753
14754         /* Clear the invert flag since have just done it here */
14755         invert = FALSE;
14756     }
14757
14758     if (ret_invlist) {
14759         *ret_invlist = cp_list;
14760         SvREFCNT_dec(swash);
14761
14762         /* Discard the generated node */
14763         if (SIZE_ONLY) {
14764             RExC_size = orig_size;
14765         }
14766         else {
14767             RExC_emit = orig_emit;
14768         }
14769         return orig_emit;
14770     }
14771
14772     /* Some character classes are equivalent to other nodes.  Such nodes take
14773      * up less room and generally fewer operations to execute than ANYOF nodes.
14774      * Above, we checked for and optimized into some such equivalents for
14775      * certain common classes that are easy to test.  Getting to this point in
14776      * the code means that the class didn't get optimized there.  Since this
14777      * code is only executed in Pass 2, it is too late to save space--it has
14778      * been allocated in Pass 1, and currently isn't given back.  But turning
14779      * things into an EXACTish node can allow the optimizer to join it to any
14780      * adjacent such nodes.  And if the class is equivalent to things like /./,
14781      * expensive run-time swashes can be avoided.  Now that we have more
14782      * complete information, we can find things necessarily missed by the
14783      * earlier code.  I (khw) am not sure how much to look for here.  It would
14784      * be easy, but perhaps too slow, to check any candidates against all the
14785      * node types they could possibly match using _invlistEQ(). */
14786
14787     if (cp_list
14788         && ! invert
14789         && ! depends_list
14790         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14791         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14792
14793            /* We don't optimize if we are supposed to make sure all non-Unicode
14794             * code points raise a warning, as only ANYOF nodes have this check.
14795             * */
14796         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14797     {
14798         UV start, end;
14799         U8 op = END;  /* The optimzation node-type */
14800         const char * cur_parse= RExC_parse;
14801
14802         invlist_iterinit(cp_list);
14803         if (! invlist_iternext(cp_list, &start, &end)) {
14804
14805             /* Here, the list is empty.  This happens, for example, when a
14806              * Unicode property is the only thing in the character class, and
14807              * it doesn't match anything.  (perluniprops.pod notes such
14808              * properties) */
14809             op = OPFAIL;
14810             *flagp |= HASWIDTH|SIMPLE;
14811         }
14812         else if (start == end) {    /* The range is a single code point */
14813             if (! invlist_iternext(cp_list, &start, &end)
14814
14815                     /* Don't do this optimization if it would require changing
14816                      * the pattern to UTF-8 */
14817                 && (start < 256 || UTF))
14818             {
14819                 /* Here, the list contains a single code point.  Can optimize
14820                  * into an EXACTish node */
14821
14822                 value = start;
14823
14824                 if (! FOLD) {
14825                     op = EXACT;
14826                 }
14827                 else if (LOC) {
14828
14829                     /* A locale node under folding with one code point can be
14830                      * an EXACTFL, as its fold won't be calculated until
14831                      * runtime */
14832                     op = EXACTFL;
14833                 }
14834                 else {
14835
14836                     /* Here, we are generally folding, but there is only one
14837                      * code point to match.  If we have to, we use an EXACT
14838                      * node, but it would be better for joining with adjacent
14839                      * nodes in the optimization pass if we used the same
14840                      * EXACTFish node that any such are likely to be.  We can
14841                      * do this iff the code point doesn't participate in any
14842                      * folds.  For example, an EXACTF of a colon is the same as
14843                      * an EXACT one, since nothing folds to or from a colon. */
14844                     if (value < 256) {
14845                         if (IS_IN_SOME_FOLD_L1(value)) {
14846                             op = EXACT;
14847                         }
14848                     }
14849                     else {
14850                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14851                             op = EXACT;
14852                         }
14853                     }
14854
14855                     /* If we haven't found the node type, above, it means we
14856                      * can use the prevailing one */
14857                     if (op == END) {
14858                         op = compute_EXACTish(pRExC_state);
14859                     }
14860                 }
14861             }
14862         }
14863         else if (start == 0) {
14864             if (end == UV_MAX) {
14865                 op = SANY;
14866                 *flagp |= HASWIDTH|SIMPLE;
14867                 RExC_naughty++;
14868             }
14869             else if (end == '\n' - 1
14870                     && invlist_iternext(cp_list, &start, &end)
14871                     && start == '\n' + 1 && end == UV_MAX)
14872             {
14873                 op = REG_ANY;
14874                 *flagp |= HASWIDTH|SIMPLE;
14875                 RExC_naughty++;
14876             }
14877         }
14878         invlist_iterfinish(cp_list);
14879
14880         if (op != END) {
14881             RExC_parse = (char *)orig_parse;
14882             RExC_emit = (regnode *)orig_emit;
14883
14884             ret = reg_node(pRExC_state, op);
14885
14886             RExC_parse = (char *)cur_parse;
14887
14888             if (PL_regkind[op] == EXACT) {
14889                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14890                                            TRUE /* downgradable to EXACT */
14891                                           );
14892             }
14893
14894             SvREFCNT_dec_NN(cp_list);
14895             return ret;
14896         }
14897     }
14898
14899     /* Here, <cp_list> contains all the code points we can determine at
14900      * compile time that match under all conditions.  Go through it, and
14901      * for things that belong in the bitmap, put them there, and delete from
14902      * <cp_list>.  While we are at it, see if everything above 255 is in the
14903      * list, and if so, set a flag to speed up execution */
14904
14905     populate_ANYOF_from_invlist(ret, &cp_list);
14906
14907     if (invert) {
14908         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14909     }
14910
14911     /* Here, the bitmap has been populated with all the Latin1 code points that
14912      * always match.  Can now add to the overall list those that match only
14913      * when the target string is UTF-8 (<depends_list>). */
14914     if (depends_list) {
14915         if (cp_list) {
14916             _invlist_union(cp_list, depends_list, &cp_list);
14917             SvREFCNT_dec_NN(depends_list);
14918         }
14919         else {
14920             cp_list = depends_list;
14921         }
14922         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14923     }
14924
14925     /* If there is a swash and more than one element, we can't use the swash in
14926      * the optimization below. */
14927     if (swash && element_count > 1) {
14928         SvREFCNT_dec_NN(swash);
14929         swash = NULL;
14930     }
14931
14932     set_ANYOF_arg(pRExC_state, ret, cp_list,
14933                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14934                    ? listsv : NULL,
14935                   only_utf8_locale_list,
14936                   swash, has_user_defined_property);
14937
14938     *flagp |= HASWIDTH|SIMPLE;
14939
14940     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14941         RExC_contains_locale = 1;
14942     }
14943
14944     return ret;
14945 }
14946
14947 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14948
14949 STATIC void
14950 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14951                 regnode* const node,
14952                 SV* const cp_list,
14953                 SV* const runtime_defns,
14954                 SV* const only_utf8_locale_list,
14955                 SV* const swash,
14956                 const bool has_user_defined_property)
14957 {
14958     /* Sets the arg field of an ANYOF-type node 'node', using information about
14959      * the node passed-in.  If there is nothing outside the node's bitmap, the
14960      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14961      * the count returned by add_data(), having allocated and stored an array,
14962      * av, that that count references, as follows:
14963      *  av[0] stores the character class description in its textual form.
14964      *        This is used later (regexec.c:Perl_regclass_swash()) to
14965      *        initialize the appropriate swash, and is also useful for dumping
14966      *        the regnode.  This is set to &PL_sv_undef if the textual
14967      *        description is not needed at run-time (as happens if the other
14968      *        elements completely define the class)
14969      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14970      *        computed from av[0].  But if no further computation need be done,
14971      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14972      *  av[2] stores the inversion list of code points that match only if the
14973      *        current locale is UTF-8
14974      *  av[3] stores the cp_list inversion list for use in addition or instead
14975      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14976      *        (Otherwise everything needed is already in av[0] and av[1])
14977      *  av[4] is set if any component of the class is from a user-defined
14978      *        property; used only if av[3] exists */
14979
14980     UV n;
14981
14982     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14983
14984     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14985         assert(! (ANYOF_FLAGS(node)
14986                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14987         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14988     }
14989     else {
14990         AV * const av = newAV();
14991         SV *rv;
14992
14993         assert(ANYOF_FLAGS(node)
14994                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14995
14996         av_store(av, 0, (runtime_defns)
14997                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14998         if (swash) {
14999             assert(cp_list);
15000             av_store(av, 1, swash);
15001             SvREFCNT_dec_NN(cp_list);
15002         }
15003         else {
15004             av_store(av, 1, &PL_sv_undef);
15005             if (cp_list) {
15006                 av_store(av, 3, cp_list);
15007                 av_store(av, 4, newSVuv(has_user_defined_property));
15008             }
15009         }
15010
15011         if (only_utf8_locale_list) {
15012             av_store(av, 2, only_utf8_locale_list);
15013         }
15014         else {
15015             av_store(av, 2, &PL_sv_undef);
15016         }
15017
15018         rv = newRV_noinc(MUTABLE_SV(av));
15019         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15020         RExC_rxi->data->data[n] = (void*)rv;
15021         ARG_SET(node, n);
15022     }
15023 }
15024
15025
15026 /* reg_skipcomment()
15027
15028    Absorbs an /x style # comment from the input stream,
15029    returning a pointer to the first character beyond the comment, or if the
15030    comment terminates the pattern without anything following it, this returns
15031    one past the final character of the pattern (in other words, RExC_end) and
15032    sets the REG_RUN_ON_COMMENT_SEEN flag.
15033
15034    Note it's the callers responsibility to ensure that we are
15035    actually in /x mode
15036
15037 */
15038
15039 PERL_STATIC_INLINE char*
15040 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15041 {
15042     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15043
15044     assert(*p = '#');
15045
15046     while (p < RExC_end) {
15047         if (*(++p) == '\n') {
15048             return p+1;
15049         }
15050     }
15051
15052     /* we ran off the end of the pattern without ending the comment, so we have
15053      * to add an \n when wrapping */
15054     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15055     return p;
15056 }
15057
15058 /* nextchar()
15059
15060    Advances the parse position, and optionally absorbs
15061    "whitespace" from the inputstream.
15062
15063    Without /x "whitespace" means (?#...) style comments only,
15064    with /x this means (?#...) and # comments and whitespace proper.
15065
15066    Returns the RExC_parse point from BEFORE the scan occurs.
15067
15068    This is the /x friendly way of saying RExC_parse++.
15069 */
15070
15071 STATIC char*
15072 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15073 {
15074     char* const retval = RExC_parse++;
15075
15076     PERL_ARGS_ASSERT_NEXTCHAR;
15077
15078     for (;;) {
15079         if (RExC_end - RExC_parse >= 3
15080             && *RExC_parse == '('
15081             && RExC_parse[1] == '?'
15082             && RExC_parse[2] == '#')
15083         {
15084             while (*RExC_parse != ')') {
15085                 if (RExC_parse == RExC_end)
15086                     FAIL("Sequence (?#... not terminated");
15087                 RExC_parse++;
15088             }
15089             RExC_parse++;
15090             continue;
15091         }
15092         if (RExC_flags & RXf_PMf_EXTENDED) {
15093             char * p = regpatws(pRExC_state, RExC_parse,
15094                                           TRUE); /* means recognize comments */
15095             if (p != RExC_parse) {
15096                 RExC_parse = p;
15097                 continue;
15098             }
15099         }
15100         return retval;
15101     }
15102 }
15103
15104 /*
15105 - reg_node - emit a node
15106 */
15107 STATIC regnode *                        /* Location. */
15108 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15109 {
15110     dVAR;
15111     regnode *ptr;
15112     regnode * const ret = RExC_emit;
15113     GET_RE_DEBUG_FLAGS_DECL;
15114
15115     PERL_ARGS_ASSERT_REG_NODE;
15116
15117     if (SIZE_ONLY) {
15118         SIZE_ALIGN(RExC_size);
15119         RExC_size += 1;
15120         return(ret);
15121     }
15122     if (RExC_emit >= RExC_emit_bound)
15123         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15124                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15125
15126     NODE_ALIGN_FILL(ret);
15127     ptr = ret;
15128     FILL_ADVANCE_NODE(ptr, op);
15129 #ifdef RE_TRACK_PATTERN_OFFSETS
15130     if (RExC_offsets) {         /* MJD */
15131         MJD_OFFSET_DEBUG(
15132               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15133               "reg_node", __LINE__,
15134               PL_reg_name[op],
15135               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15136                 ? "Overwriting end of array!\n" : "OK",
15137               (UV)(RExC_emit - RExC_emit_start),
15138               (UV)(RExC_parse - RExC_start),
15139               (UV)RExC_offsets[0]));
15140         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15141     }
15142 #endif
15143     RExC_emit = ptr;
15144     return(ret);
15145 }
15146
15147 /*
15148 - reganode - emit a node with an argument
15149 */
15150 STATIC regnode *                        /* Location. */
15151 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15152 {
15153     dVAR;
15154     regnode *ptr;
15155     regnode * const ret = RExC_emit;
15156     GET_RE_DEBUG_FLAGS_DECL;
15157
15158     PERL_ARGS_ASSERT_REGANODE;
15159
15160     if (SIZE_ONLY) {
15161         SIZE_ALIGN(RExC_size);
15162         RExC_size += 2;
15163         /*
15164            We can't do this:
15165
15166            assert(2==regarglen[op]+1);
15167
15168            Anything larger than this has to allocate the extra amount.
15169            If we changed this to be:
15170
15171            RExC_size += (1 + regarglen[op]);
15172
15173            then it wouldn't matter. Its not clear what side effect
15174            might come from that so its not done so far.
15175            -- dmq
15176         */
15177         return(ret);
15178     }
15179     if (RExC_emit >= RExC_emit_bound)
15180         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15181                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15182
15183     NODE_ALIGN_FILL(ret);
15184     ptr = ret;
15185     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15186 #ifdef RE_TRACK_PATTERN_OFFSETS
15187     if (RExC_offsets) {         /* MJD */
15188         MJD_OFFSET_DEBUG(
15189               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15190               "reganode",
15191               __LINE__,
15192               PL_reg_name[op],
15193               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15194               "Overwriting end of array!\n" : "OK",
15195               (UV)(RExC_emit - RExC_emit_start),
15196               (UV)(RExC_parse - RExC_start),
15197               (UV)RExC_offsets[0]));
15198         Set_Cur_Node_Offset;
15199     }
15200 #endif
15201     RExC_emit = ptr;
15202     return(ret);
15203 }
15204
15205 /*
15206 - reguni - emit (if appropriate) a Unicode character
15207 */
15208 PERL_STATIC_INLINE STRLEN
15209 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15210 {
15211     dVAR;
15212
15213     PERL_ARGS_ASSERT_REGUNI;
15214
15215     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15216 }
15217
15218 /*
15219 - reginsert - insert an operator in front of already-emitted operand
15220 *
15221 * Means relocating the operand.
15222 */
15223 STATIC void
15224 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15225 {
15226     dVAR;
15227     regnode *src;
15228     regnode *dst;
15229     regnode *place;
15230     const int offset = regarglen[(U8)op];
15231     const int size = NODE_STEP_REGNODE + offset;
15232     GET_RE_DEBUG_FLAGS_DECL;
15233
15234     PERL_ARGS_ASSERT_REGINSERT;
15235     PERL_UNUSED_CONTEXT;
15236     PERL_UNUSED_ARG(depth);
15237 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15238     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15239     if (SIZE_ONLY) {
15240         RExC_size += size;
15241         return;
15242     }
15243
15244     src = RExC_emit;
15245     RExC_emit += size;
15246     dst = RExC_emit;
15247     if (RExC_open_parens) {
15248         int paren;
15249         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15250         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15251             if ( RExC_open_parens[paren] >= opnd ) {
15252                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15253                 RExC_open_parens[paren] += size;
15254             } else {
15255                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15256             }
15257             if ( RExC_close_parens[paren] >= opnd ) {
15258                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15259                 RExC_close_parens[paren] += size;
15260             } else {
15261                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15262             }
15263         }
15264     }
15265
15266     while (src > opnd) {
15267         StructCopy(--src, --dst, regnode);
15268 #ifdef RE_TRACK_PATTERN_OFFSETS
15269         if (RExC_offsets) {     /* MJD 20010112 */
15270             MJD_OFFSET_DEBUG(
15271                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15272                   "reg_insert",
15273                   __LINE__,
15274                   PL_reg_name[op],
15275                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15276                     ? "Overwriting end of array!\n" : "OK",
15277                   (UV)(src - RExC_emit_start),
15278                   (UV)(dst - RExC_emit_start),
15279                   (UV)RExC_offsets[0]));
15280             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15281             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15282         }
15283 #endif
15284     }
15285
15286
15287     place = opnd;               /* Op node, where operand used to be. */
15288 #ifdef RE_TRACK_PATTERN_OFFSETS
15289     if (RExC_offsets) {         /* MJD */
15290         MJD_OFFSET_DEBUG(
15291               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15292               "reginsert",
15293               __LINE__,
15294               PL_reg_name[op],
15295               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15296               ? "Overwriting end of array!\n" : "OK",
15297               (UV)(place - RExC_emit_start),
15298               (UV)(RExC_parse - RExC_start),
15299               (UV)RExC_offsets[0]));
15300         Set_Node_Offset(place, RExC_parse);
15301         Set_Node_Length(place, 1);
15302     }
15303 #endif
15304     src = NEXTOPER(place);
15305     FILL_ADVANCE_NODE(place, op);
15306     Zero(src, offset, regnode);
15307 }
15308
15309 /*
15310 - regtail - set the next-pointer at the end of a node chain of p to val.
15311 - SEE ALSO: regtail_study
15312 */
15313 /* TODO: All three parms should be const */
15314 STATIC void
15315 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15316                 const regnode *val,U32 depth)
15317 {
15318     dVAR;
15319     regnode *scan;
15320     GET_RE_DEBUG_FLAGS_DECL;
15321
15322     PERL_ARGS_ASSERT_REGTAIL;
15323 #ifndef DEBUGGING
15324     PERL_UNUSED_ARG(depth);
15325 #endif
15326
15327     if (SIZE_ONLY)
15328         return;
15329
15330     /* Find last node. */
15331     scan = p;
15332     for (;;) {
15333         regnode * const temp = regnext(scan);
15334         DEBUG_PARSE_r({
15335             SV * const mysv=sv_newmortal();
15336             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15337             regprop(RExC_rx, mysv, scan, NULL);
15338             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15339                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15340                     (temp == NULL ? "->" : ""),
15341                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15342             );
15343         });
15344         if (temp == NULL)
15345             break;
15346         scan = temp;
15347     }
15348
15349     if (reg_off_by_arg[OP(scan)]) {
15350         ARG_SET(scan, val - scan);
15351     }
15352     else {
15353         NEXT_OFF(scan) = val - scan;
15354     }
15355 }
15356
15357 #ifdef DEBUGGING
15358 /*
15359 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15360 - Look for optimizable sequences at the same time.
15361 - currently only looks for EXACT chains.
15362
15363 This is experimental code. The idea is to use this routine to perform
15364 in place optimizations on branches and groups as they are constructed,
15365 with the long term intention of removing optimization from study_chunk so
15366 that it is purely analytical.
15367
15368 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15369 to control which is which.
15370
15371 */
15372 /* TODO: All four parms should be const */
15373
15374 STATIC U8
15375 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15376                       const regnode *val,U32 depth)
15377 {
15378     dVAR;
15379     regnode *scan;
15380     U8 exact = PSEUDO;
15381 #ifdef EXPERIMENTAL_INPLACESCAN
15382     I32 min = 0;
15383 #endif
15384     GET_RE_DEBUG_FLAGS_DECL;
15385
15386     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15387
15388
15389     if (SIZE_ONLY)
15390         return exact;
15391
15392     /* Find last node. */
15393
15394     scan = p;
15395     for (;;) {
15396         regnode * const temp = regnext(scan);
15397 #ifdef EXPERIMENTAL_INPLACESCAN
15398         if (PL_regkind[OP(scan)] == EXACT) {
15399             bool unfolded_multi_char;   /* Unexamined in this routine */
15400             if (join_exact(pRExC_state, scan, &min,
15401                            &unfolded_multi_char, 1, val, depth+1))
15402                 return EXACT;
15403         }
15404 #endif
15405         if ( exact ) {
15406             switch (OP(scan)) {
15407                 case EXACT:
15408                 case EXACTF:
15409                 case EXACTFA_NO_TRIE:
15410                 case EXACTFA:
15411                 case EXACTFU:
15412                 case EXACTFU_SS:
15413                 case EXACTFL:
15414                         if( exact == PSEUDO )
15415                             exact= OP(scan);
15416                         else if ( exact != OP(scan) )
15417                             exact= 0;
15418                 case NOTHING:
15419                     break;
15420                 default:
15421                     exact= 0;
15422             }
15423         }
15424         DEBUG_PARSE_r({
15425             SV * const mysv=sv_newmortal();
15426             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15427             regprop(RExC_rx, mysv, scan, NULL);
15428             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15429                 SvPV_nolen_const(mysv),
15430                 REG_NODE_NUM(scan),
15431                 PL_reg_name[exact]);
15432         });
15433         if (temp == NULL)
15434             break;
15435         scan = temp;
15436     }
15437     DEBUG_PARSE_r({
15438         SV * const mysv_val=sv_newmortal();
15439         DEBUG_PARSE_MSG("");
15440         regprop(RExC_rx, mysv_val, val, NULL);
15441         PerlIO_printf(Perl_debug_log,
15442                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15443                       SvPV_nolen_const(mysv_val),
15444                       (IV)REG_NODE_NUM(val),
15445                       (IV)(val - scan)
15446         );
15447     });
15448     if (reg_off_by_arg[OP(scan)]) {
15449         ARG_SET(scan, val - scan);
15450     }
15451     else {
15452         NEXT_OFF(scan) = val - scan;
15453     }
15454
15455     return exact;
15456 }
15457 #endif
15458
15459 /*
15460  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15461  */
15462 #ifdef DEBUGGING
15463
15464 static void
15465 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15466 {
15467     int bit;
15468     int set=0;
15469
15470     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15471
15472     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15473         if (flags & (1<<bit)) {
15474             if (!set++ && lead)
15475                 PerlIO_printf(Perl_debug_log, "%s",lead);
15476             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15477         }
15478     }
15479     if (lead)  {
15480         if (set)
15481             PerlIO_printf(Perl_debug_log, "\n");
15482         else
15483             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15484     }
15485 }
15486
15487 static void
15488 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15489 {
15490     int bit;
15491     int set=0;
15492     regex_charset cs;
15493
15494     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15495
15496     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15497         if (flags & (1<<bit)) {
15498             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15499                 continue;
15500             }
15501             if (!set++ && lead)
15502                 PerlIO_printf(Perl_debug_log, "%s",lead);
15503             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15504         }
15505     }
15506     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15507             if (!set++ && lead) {
15508                 PerlIO_printf(Perl_debug_log, "%s",lead);
15509             }
15510             switch (cs) {
15511                 case REGEX_UNICODE_CHARSET:
15512                     PerlIO_printf(Perl_debug_log, "UNICODE");
15513                     break;
15514                 case REGEX_LOCALE_CHARSET:
15515                     PerlIO_printf(Perl_debug_log, "LOCALE");
15516                     break;
15517                 case REGEX_ASCII_RESTRICTED_CHARSET:
15518                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15519                     break;
15520                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15521                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15522                     break;
15523                 default:
15524                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15525                     break;
15526             }
15527     }
15528     if (lead)  {
15529         if (set)
15530             PerlIO_printf(Perl_debug_log, "\n");
15531         else
15532             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15533     }
15534 }
15535 #endif
15536
15537 void
15538 Perl_regdump(pTHX_ const regexp *r)
15539 {
15540 #ifdef DEBUGGING
15541     dVAR;
15542     SV * const sv = sv_newmortal();
15543     SV *dsv= sv_newmortal();
15544     RXi_GET_DECL(r,ri);
15545     GET_RE_DEBUG_FLAGS_DECL;
15546
15547     PERL_ARGS_ASSERT_REGDUMP;
15548
15549     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15550
15551     /* Header fields of interest. */
15552     if (r->anchored_substr) {
15553         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15554             RE_SV_DUMPLEN(r->anchored_substr), 30);
15555         PerlIO_printf(Perl_debug_log,
15556                       "anchored %s%s at %"IVdf" ",
15557                       s, RE_SV_TAIL(r->anchored_substr),
15558                       (IV)r->anchored_offset);
15559     } else if (r->anchored_utf8) {
15560         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15561             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15562         PerlIO_printf(Perl_debug_log,
15563                       "anchored utf8 %s%s at %"IVdf" ",
15564                       s, RE_SV_TAIL(r->anchored_utf8),
15565                       (IV)r->anchored_offset);
15566     }
15567     if (r->float_substr) {
15568         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15569             RE_SV_DUMPLEN(r->float_substr), 30);
15570         PerlIO_printf(Perl_debug_log,
15571                       "floating %s%s at %"IVdf"..%"UVuf" ",
15572                       s, RE_SV_TAIL(r->float_substr),
15573                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15574     } else if (r->float_utf8) {
15575         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15576             RE_SV_DUMPLEN(r->float_utf8), 30);
15577         PerlIO_printf(Perl_debug_log,
15578                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15579                       s, RE_SV_TAIL(r->float_utf8),
15580                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15581     }
15582     if (r->check_substr || r->check_utf8)
15583         PerlIO_printf(Perl_debug_log,
15584                       (const char *)
15585                       (r->check_substr == r->float_substr
15586                        && r->check_utf8 == r->float_utf8
15587                        ? "(checking floating" : "(checking anchored"));
15588     if (r->intflags & PREGf_NOSCAN)
15589         PerlIO_printf(Perl_debug_log, " noscan");
15590     if (r->extflags & RXf_CHECK_ALL)
15591         PerlIO_printf(Perl_debug_log, " isall");
15592     if (r->check_substr || r->check_utf8)
15593         PerlIO_printf(Perl_debug_log, ") ");
15594
15595     if (ri->regstclass) {
15596         regprop(r, sv, ri->regstclass, NULL);
15597         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15598     }
15599     if (r->intflags & PREGf_ANCH) {
15600         PerlIO_printf(Perl_debug_log, "anchored");
15601         if (r->intflags & PREGf_ANCH_BOL)
15602             PerlIO_printf(Perl_debug_log, "(BOL)");
15603         if (r->intflags & PREGf_ANCH_MBOL)
15604             PerlIO_printf(Perl_debug_log, "(MBOL)");
15605         if (r->intflags & PREGf_ANCH_SBOL)
15606             PerlIO_printf(Perl_debug_log, "(SBOL)");
15607         if (r->intflags & PREGf_ANCH_GPOS)
15608             PerlIO_printf(Perl_debug_log, "(GPOS)");
15609         PerlIO_putc(Perl_debug_log, ' ');
15610     }
15611     if (r->intflags & PREGf_GPOS_SEEN)
15612         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15613     if (r->intflags & PREGf_SKIP)
15614         PerlIO_printf(Perl_debug_log, "plus ");
15615     if (r->intflags & PREGf_IMPLICIT)
15616         PerlIO_printf(Perl_debug_log, "implicit ");
15617     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15618     if (r->extflags & RXf_EVAL_SEEN)
15619         PerlIO_printf(Perl_debug_log, "with eval ");
15620     PerlIO_printf(Perl_debug_log, "\n");
15621     DEBUG_FLAGS_r({
15622         regdump_extflags("r->extflags: ",r->extflags);
15623         regdump_intflags("r->intflags: ",r->intflags);
15624     });
15625 #else
15626     PERL_ARGS_ASSERT_REGDUMP;
15627     PERL_UNUSED_CONTEXT;
15628     PERL_UNUSED_ARG(r);
15629 #endif  /* DEBUGGING */
15630 }
15631
15632 /*
15633 - regprop - printable representation of opcode, with run time support
15634 */
15635
15636 void
15637 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15638 {
15639 #ifdef DEBUGGING
15640     dVAR;
15641     int k;
15642
15643     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15644     static const char * const anyofs[] = {
15645 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15646     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15647     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15648     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15649     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15650     || _CC_VERTSPACE != 16
15651   #error Need to adjust order of anyofs[]
15652 #endif
15653         "\\w",
15654         "\\W",
15655         "\\d",
15656         "\\D",
15657         "[:alpha:]",
15658         "[:^alpha:]",
15659         "[:lower:]",
15660         "[:^lower:]",
15661         "[:upper:]",
15662         "[:^upper:]",
15663         "[:punct:]",
15664         "[:^punct:]",
15665         "[:print:]",
15666         "[:^print:]",
15667         "[:alnum:]",
15668         "[:^alnum:]",
15669         "[:graph:]",
15670         "[:^graph:]",
15671         "[:cased:]",
15672         "[:^cased:]",
15673         "\\s",
15674         "\\S",
15675         "[:blank:]",
15676         "[:^blank:]",
15677         "[:xdigit:]",
15678         "[:^xdigit:]",
15679         "[:space:]",
15680         "[:^space:]",
15681         "[:cntrl:]",
15682         "[:^cntrl:]",
15683         "[:ascii:]",
15684         "[:^ascii:]",
15685         "\\v",
15686         "\\V"
15687     };
15688     RXi_GET_DECL(prog,progi);
15689     GET_RE_DEBUG_FLAGS_DECL;
15690
15691     PERL_ARGS_ASSERT_REGPROP;
15692
15693     sv_setpvs(sv, "");
15694
15695     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15696         /* It would be nice to FAIL() here, but this may be called from
15697            regexec.c, and it would be hard to supply pRExC_state. */
15698         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15699                                               (int)OP(o), (int)REGNODE_MAX);
15700     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15701
15702     k = PL_regkind[OP(o)];
15703
15704     if (k == EXACT) {
15705         sv_catpvs(sv, " ");
15706         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15707          * is a crude hack but it may be the best for now since
15708          * we have no flag "this EXACTish node was UTF-8"
15709          * --jhi */
15710         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15711                   PERL_PV_ESCAPE_UNI_DETECT |
15712                   PERL_PV_ESCAPE_NONASCII   |
15713                   PERL_PV_PRETTY_ELLIPSES   |
15714                   PERL_PV_PRETTY_LTGT       |
15715                   PERL_PV_PRETTY_NOCLEAR
15716                   );
15717     } else if (k == TRIE) {
15718         /* print the details of the trie in dumpuntil instead, as
15719          * progi->data isn't available here */
15720         const char op = OP(o);
15721         const U32 n = ARG(o);
15722         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15723                (reg_ac_data *)progi->data->data[n] :
15724                NULL;
15725         const reg_trie_data * const trie
15726             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15727
15728         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15729         DEBUG_TRIE_COMPILE_r(
15730           Perl_sv_catpvf(aTHX_ sv,
15731             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15732             (UV)trie->startstate,
15733             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15734             (UV)trie->wordcount,
15735             (UV)trie->minlen,
15736             (UV)trie->maxlen,
15737             (UV)TRIE_CHARCOUNT(trie),
15738             (UV)trie->uniquecharcount
15739           );
15740         );
15741         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15742             sv_catpvs(sv, "[");
15743             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15744                                                    ? ANYOF_BITMAP(o)
15745                                                    : TRIE_BITMAP(trie));
15746             sv_catpvs(sv, "]");
15747         }
15748
15749     } else if (k == CURLY) {
15750         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15751             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15752         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15753     }
15754     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15755         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15756     else if (k == REF || k == OPEN || k == CLOSE
15757              || k == GROUPP || OP(o)==ACCEPT)
15758     {
15759         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15760         if ( RXp_PAREN_NAMES(prog) ) {
15761             if ( k != REF || (OP(o) < NREF)) {
15762                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15763                 SV **name= av_fetch(list, ARG(o), 0 );
15764                 if (name)
15765                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15766             }
15767             else {
15768                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15769                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15770                 I32 *nums=(I32*)SvPVX(sv_dat);
15771                 SV **name= av_fetch(list, nums[0], 0 );
15772                 I32 n;
15773                 if (name) {
15774                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15775                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15776                                     (n ? "," : ""), (IV)nums[n]);
15777                     }
15778                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15779                 }
15780             }
15781         }
15782         if ( k == REF && reginfo) {
15783             U32 n = ARG(o);  /* which paren pair */
15784             I32 ln = prog->offs[n].start;
15785             if (prog->lastparen < n || ln == -1)
15786                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15787             else if (ln == prog->offs[n].end)
15788                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15789             else {
15790                 const char *s = reginfo->strbeg + ln;
15791                 Perl_sv_catpvf(aTHX_ sv, ": ");
15792                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15793                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15794             }
15795         }
15796     } else if (k == GOSUB)
15797         /* Paren and offset */
15798         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15799     else if (k == VERB) {
15800         if (!o->flags)
15801             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15802                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15803     } else if (k == LOGICAL)
15804         /* 2: embedded, otherwise 1 */
15805         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15806     else if (k == ANYOF) {
15807         const U8 flags = ANYOF_FLAGS(o);
15808         int do_sep = 0;
15809
15810
15811         if (flags & ANYOF_LOCALE_FLAGS)
15812             sv_catpvs(sv, "{loc}");
15813         if (flags & ANYOF_LOC_FOLD)
15814             sv_catpvs(sv, "{i}");
15815         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15816         if (flags & ANYOF_INVERT)
15817             sv_catpvs(sv, "^");
15818
15819         /* output what the standard cp 0-255 bitmap matches */
15820         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15821
15822         /* output any special charclass tests (used entirely under use
15823          * locale) * */
15824         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15825             int i;
15826             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15827                 if (ANYOF_POSIXL_TEST(o,i)) {
15828                     sv_catpv(sv, anyofs[i]);
15829                     do_sep = 1;
15830                 }
15831             }
15832         }
15833
15834         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15835                       |ANYOF_UTF8
15836                       |ANYOF_NONBITMAP_NON_UTF8
15837                       |ANYOF_LOC_FOLD)))
15838         {
15839             if (do_sep) {
15840                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15841                 if (flags & ANYOF_INVERT)
15842                     /*make sure the invert info is in each */
15843                     sv_catpvs(sv, "^");
15844             }
15845
15846             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15847                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15848             }
15849
15850             /* output information about the unicode matching */
15851             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15852                 sv_catpvs(sv, "{unicode_all}");
15853             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15854                 SV *lv; /* Set if there is something outside the bit map. */
15855                 bool byte_output = FALSE;   /* If something in the bitmap has
15856                                                been output */
15857                 SV *only_utf8_locale;
15858
15859                 /* Get the stuff that wasn't in the bitmap */
15860                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15861                                                     &lv, &only_utf8_locale);
15862                 if (lv && lv != &PL_sv_undef) {
15863                     char *s = savesvpv(lv);
15864                     char * const origs = s;
15865
15866                     while (*s && *s != '\n')
15867                         s++;
15868
15869                     if (*s == '\n') {
15870                         const char * const t = ++s;
15871
15872                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15873                             sv_catpvs(sv, "{outside bitmap}");
15874                         }
15875                         else {
15876                             sv_catpvs(sv, "{utf8}");
15877                         }
15878
15879                         if (byte_output) {
15880                             sv_catpvs(sv, " ");
15881                         }
15882
15883                         while (*s) {
15884                             if (*s == '\n') {
15885
15886                                 /* Truncate very long output */
15887                                 if (s - origs > 256) {
15888                                     Perl_sv_catpvf(aTHX_ sv,
15889                                                 "%.*s...",
15890                                                 (int) (s - origs - 1),
15891                                                 t);
15892                                     goto out_dump;
15893                                 }
15894                                 *s = ' ';
15895                             }
15896                             else if (*s == '\t') {
15897                                 *s = '-';
15898                             }
15899                             s++;
15900                         }
15901                         if (s[-1] == ' ')
15902                             s[-1] = 0;
15903
15904                         sv_catpv(sv, t);
15905                     }
15906
15907                 out_dump:
15908
15909                     Safefree(origs);
15910                     SvREFCNT_dec_NN(lv);
15911                 }
15912
15913                 if ((flags & ANYOF_LOC_FOLD)
15914                      && only_utf8_locale
15915                      && only_utf8_locale != &PL_sv_undef)
15916                 {
15917                     UV start, end;
15918                     int max_entries = 256;
15919
15920                     sv_catpvs(sv, "{utf8 locale}");
15921                     invlist_iterinit(only_utf8_locale);
15922                     while (invlist_iternext(only_utf8_locale,
15923                                             &start, &end)) {
15924                         put_range(sv, start, end);
15925                         max_entries --;
15926                         if (max_entries < 0) {
15927                             sv_catpvs(sv, "...");
15928                             break;
15929                         }
15930                     }
15931                     invlist_iterfinish(only_utf8_locale);
15932                 }
15933             }
15934         }
15935
15936         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15937     }
15938     else if (k == POSIXD || k == NPOSIXD) {
15939         U8 index = FLAGS(o) * 2;
15940         if (index < C_ARRAY_LENGTH(anyofs)) {
15941             if (*anyofs[index] != '[')  {
15942                 sv_catpv(sv, "[");
15943             }
15944             sv_catpv(sv, anyofs[index]);
15945             if (*anyofs[index] != '[')  {
15946                 sv_catpv(sv, "]");
15947             }
15948         }
15949         else {
15950             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15951         }
15952     }
15953     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15954         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15955 #else
15956     PERL_UNUSED_CONTEXT;
15957     PERL_UNUSED_ARG(sv);
15958     PERL_UNUSED_ARG(o);
15959     PERL_UNUSED_ARG(prog);
15960     PERL_UNUSED_ARG(reginfo);
15961 #endif  /* DEBUGGING */
15962 }
15963
15964
15965
15966 SV *
15967 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15968 {                               /* Assume that RE_INTUIT is set */
15969     dVAR;
15970     struct regexp *const prog = ReANY(r);
15971     GET_RE_DEBUG_FLAGS_DECL;
15972
15973     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15974     PERL_UNUSED_CONTEXT;
15975
15976     DEBUG_COMPILE_r(
15977         {
15978             const char * const s = SvPV_nolen_const(prog->check_substr
15979                       ? prog->check_substr : prog->check_utf8);
15980
15981             if (!PL_colorset) reginitcolors();
15982             PerlIO_printf(Perl_debug_log,
15983                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15984                       PL_colors[4],
15985                       prog->check_substr ? "" : "utf8 ",
15986                       PL_colors[5],PL_colors[0],
15987                       s,
15988                       PL_colors[1],
15989                       (strlen(s) > 60 ? "..." : ""));
15990         } );
15991
15992     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15993 }
15994
15995 /*
15996    pregfree()
15997
15998    handles refcounting and freeing the perl core regexp structure. When
15999    it is necessary to actually free the structure the first thing it
16000    does is call the 'free' method of the regexp_engine associated to
16001    the regexp, allowing the handling of the void *pprivate; member
16002    first. (This routine is not overridable by extensions, which is why
16003    the extensions free is called first.)
16004
16005    See regdupe and regdupe_internal if you change anything here.
16006 */
16007 #ifndef PERL_IN_XSUB_RE
16008 void
16009 Perl_pregfree(pTHX_ REGEXP *r)
16010 {
16011     SvREFCNT_dec(r);
16012 }
16013
16014 void
16015 Perl_pregfree2(pTHX_ REGEXP *rx)
16016 {
16017     dVAR;
16018     struct regexp *const r = ReANY(rx);
16019     GET_RE_DEBUG_FLAGS_DECL;
16020
16021     PERL_ARGS_ASSERT_PREGFREE2;
16022
16023     if (r->mother_re) {
16024         ReREFCNT_dec(r->mother_re);
16025     } else {
16026         CALLREGFREE_PVT(rx); /* free the private data */
16027         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16028         Safefree(r->xpv_len_u.xpvlenu_pv);
16029     }
16030     if (r->substrs) {
16031         SvREFCNT_dec(r->anchored_substr);
16032         SvREFCNT_dec(r->anchored_utf8);
16033         SvREFCNT_dec(r->float_substr);
16034         SvREFCNT_dec(r->float_utf8);
16035         Safefree(r->substrs);
16036     }
16037     RX_MATCH_COPY_FREE(rx);
16038 #ifdef PERL_ANY_COW
16039     SvREFCNT_dec(r->saved_copy);
16040 #endif
16041     Safefree(r->offs);
16042     SvREFCNT_dec(r->qr_anoncv);
16043     rx->sv_u.svu_rx = 0;
16044 }
16045
16046 /*  reg_temp_copy()
16047
16048     This is a hacky workaround to the structural issue of match results
16049     being stored in the regexp structure which is in turn stored in
16050     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16051     could be PL_curpm in multiple contexts, and could require multiple
16052     result sets being associated with the pattern simultaneously, such
16053     as when doing a recursive match with (??{$qr})
16054
16055     The solution is to make a lightweight copy of the regexp structure
16056     when a qr// is returned from the code executed by (??{$qr}) this
16057     lightweight copy doesn't actually own any of its data except for
16058     the starp/end and the actual regexp structure itself.
16059
16060 */
16061
16062
16063 REGEXP *
16064 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16065 {
16066     struct regexp *ret;
16067     struct regexp *const r = ReANY(rx);
16068     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16069
16070     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16071
16072     if (!ret_x)
16073         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16074     else {
16075         SvOK_off((SV *)ret_x);
16076         if (islv) {
16077             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16078                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16079                made both spots point to the same regexp body.) */
16080             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16081             assert(!SvPVX(ret_x));
16082             ret_x->sv_u.svu_rx = temp->sv_any;
16083             temp->sv_any = NULL;
16084             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16085             SvREFCNT_dec_NN(temp);
16086             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16087                ing below will not set it. */
16088             SvCUR_set(ret_x, SvCUR(rx));
16089         }
16090     }
16091     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16092        sv_force_normal(sv) is called.  */
16093     SvFAKE_on(ret_x);
16094     ret = ReANY(ret_x);
16095
16096     SvFLAGS(ret_x) |= SvUTF8(rx);
16097     /* We share the same string buffer as the original regexp, on which we
16098        hold a reference count, incremented when mother_re is set below.
16099        The string pointer is copied here, being part of the regexp struct.
16100      */
16101     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16102            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16103     if (r->offs) {
16104         const I32 npar = r->nparens+1;
16105         Newx(ret->offs, npar, regexp_paren_pair);
16106         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16107     }
16108     if (r->substrs) {
16109         Newx(ret->substrs, 1, struct reg_substr_data);
16110         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16111
16112         SvREFCNT_inc_void(ret->anchored_substr);
16113         SvREFCNT_inc_void(ret->anchored_utf8);
16114         SvREFCNT_inc_void(ret->float_substr);
16115         SvREFCNT_inc_void(ret->float_utf8);
16116
16117         /* check_substr and check_utf8, if non-NULL, point to either their
16118            anchored or float namesakes, and don't hold a second reference.  */
16119     }
16120     RX_MATCH_COPIED_off(ret_x);
16121 #ifdef PERL_ANY_COW
16122     ret->saved_copy = NULL;
16123 #endif
16124     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16125     SvREFCNT_inc_void(ret->qr_anoncv);
16126
16127     return ret_x;
16128 }
16129 #endif
16130
16131 /* regfree_internal()
16132
16133    Free the private data in a regexp. This is overloadable by
16134    extensions. Perl takes care of the regexp structure in pregfree(),
16135    this covers the *pprivate pointer which technically perl doesn't
16136    know about, however of course we have to handle the
16137    regexp_internal structure when no extension is in use.
16138
16139    Note this is called before freeing anything in the regexp
16140    structure.
16141  */
16142
16143 void
16144 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16145 {
16146     dVAR;
16147     struct regexp *const r = ReANY(rx);
16148     RXi_GET_DECL(r,ri);
16149     GET_RE_DEBUG_FLAGS_DECL;
16150
16151     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16152
16153     DEBUG_COMPILE_r({
16154         if (!PL_colorset)
16155             reginitcolors();
16156         {
16157             SV *dsv= sv_newmortal();
16158             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16159                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16160             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16161                 PL_colors[4],PL_colors[5],s);
16162         }
16163     });
16164 #ifdef RE_TRACK_PATTERN_OFFSETS
16165     if (ri->u.offsets)
16166         Safefree(ri->u.offsets);             /* 20010421 MJD */
16167 #endif
16168     if (ri->code_blocks) {
16169         int n;
16170         for (n = 0; n < ri->num_code_blocks; n++)
16171             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16172         Safefree(ri->code_blocks);
16173     }
16174
16175     if (ri->data) {
16176         int n = ri->data->count;
16177
16178         while (--n >= 0) {
16179           /* If you add a ->what type here, update the comment in regcomp.h */
16180             switch (ri->data->what[n]) {
16181             case 'a':
16182             case 'r':
16183             case 's':
16184             case 'S':
16185             case 'u':
16186                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16187                 break;
16188             case 'f':
16189                 Safefree(ri->data->data[n]);
16190                 break;
16191             case 'l':
16192             case 'L':
16193                 break;
16194             case 'T':
16195                 { /* Aho Corasick add-on structure for a trie node.
16196                      Used in stclass optimization only */
16197                     U32 refcount;
16198                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16199                     OP_REFCNT_LOCK;
16200                     refcount = --aho->refcount;
16201                     OP_REFCNT_UNLOCK;
16202                     if ( !refcount ) {
16203                         PerlMemShared_free(aho->states);
16204                         PerlMemShared_free(aho->fail);
16205                          /* do this last!!!! */
16206                         PerlMemShared_free(ri->data->data[n]);
16207                         /* we should only ever get called once, so
16208                          * assert as much, and also guard the free
16209                          * which /might/ happen twice. At the least
16210                          * it will make code anlyzers happy and it
16211                          * doesn't cost much. - Yves */
16212                         assert(ri->regstclass);
16213                         if (ri->regstclass) {
16214                             PerlMemShared_free(ri->regstclass);
16215                             ri->regstclass = 0;
16216                         }
16217                     }
16218                 }
16219                 break;
16220             case 't':
16221                 {
16222                     /* trie structure. */
16223                     U32 refcount;
16224                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16225                     OP_REFCNT_LOCK;
16226                     refcount = --trie->refcount;
16227                     OP_REFCNT_UNLOCK;
16228                     if ( !refcount ) {
16229                         PerlMemShared_free(trie->charmap);
16230                         PerlMemShared_free(trie->states);
16231                         PerlMemShared_free(trie->trans);
16232                         if (trie->bitmap)
16233                             PerlMemShared_free(trie->bitmap);
16234                         if (trie->jump)
16235                             PerlMemShared_free(trie->jump);
16236                         PerlMemShared_free(trie->wordinfo);
16237                         /* do this last!!!! */
16238                         PerlMemShared_free(ri->data->data[n]);
16239                     }
16240                 }
16241                 break;
16242             default:
16243                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16244                                                     ri->data->what[n]);
16245             }
16246         }
16247         Safefree(ri->data->what);
16248         Safefree(ri->data);
16249     }
16250
16251     Safefree(ri);
16252 }
16253
16254 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16255 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16256 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16257
16258 /*
16259    re_dup - duplicate a regexp.
16260
16261    This routine is expected to clone a given regexp structure. It is only
16262    compiled under USE_ITHREADS.
16263
16264    After all of the core data stored in struct regexp is duplicated
16265    the regexp_engine.dupe method is used to copy any private data
16266    stored in the *pprivate pointer. This allows extensions to handle
16267    any duplication it needs to do.
16268
16269    See pregfree() and regfree_internal() if you change anything here.
16270 */
16271 #if defined(USE_ITHREADS)
16272 #ifndef PERL_IN_XSUB_RE
16273 void
16274 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16275 {
16276     dVAR;
16277     I32 npar;
16278     const struct regexp *r = ReANY(sstr);
16279     struct regexp *ret = ReANY(dstr);
16280
16281     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16282
16283     npar = r->nparens+1;
16284     Newx(ret->offs, npar, regexp_paren_pair);
16285     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16286
16287     if (ret->substrs) {
16288         /* Do it this way to avoid reading from *r after the StructCopy().
16289            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16290            cache, it doesn't matter.  */
16291         const bool anchored = r->check_substr
16292             ? r->check_substr == r->anchored_substr
16293             : r->check_utf8 == r->anchored_utf8;
16294         Newx(ret->substrs, 1, struct reg_substr_data);
16295         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16296
16297         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16298         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16299         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16300         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16301
16302         /* check_substr and check_utf8, if non-NULL, point to either their
16303            anchored or float namesakes, and don't hold a second reference.  */
16304
16305         if (ret->check_substr) {
16306             if (anchored) {
16307                 assert(r->check_utf8 == r->anchored_utf8);
16308                 ret->check_substr = ret->anchored_substr;
16309                 ret->check_utf8 = ret->anchored_utf8;
16310             } else {
16311                 assert(r->check_substr == r->float_substr);
16312                 assert(r->check_utf8 == r->float_utf8);
16313                 ret->check_substr = ret->float_substr;
16314                 ret->check_utf8 = ret->float_utf8;
16315             }
16316         } else if (ret->check_utf8) {
16317             if (anchored) {
16318                 ret->check_utf8 = ret->anchored_utf8;
16319             } else {
16320                 ret->check_utf8 = ret->float_utf8;
16321             }
16322         }
16323     }
16324
16325     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16326     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16327
16328     if (ret->pprivate)
16329         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16330
16331     if (RX_MATCH_COPIED(dstr))
16332         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16333     else
16334         ret->subbeg = NULL;
16335 #ifdef PERL_ANY_COW
16336     ret->saved_copy = NULL;
16337 #endif
16338
16339     /* Whether mother_re be set or no, we need to copy the string.  We
16340        cannot refrain from copying it when the storage points directly to
16341        our mother regexp, because that's
16342                1: a buffer in a different thread
16343                2: something we no longer hold a reference on
16344                so we need to copy it locally.  */
16345     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16346     ret->mother_re   = NULL;
16347 }
16348 #endif /* PERL_IN_XSUB_RE */
16349
16350 /*
16351    regdupe_internal()
16352
16353    This is the internal complement to regdupe() which is used to copy
16354    the structure pointed to by the *pprivate pointer in the regexp.
16355    This is the core version of the extension overridable cloning hook.
16356    The regexp structure being duplicated will be copied by perl prior
16357    to this and will be provided as the regexp *r argument, however
16358    with the /old/ structures pprivate pointer value. Thus this routine
16359    may override any copying normally done by perl.
16360
16361    It returns a pointer to the new regexp_internal structure.
16362 */
16363
16364 void *
16365 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16366 {
16367     dVAR;
16368     struct regexp *const r = ReANY(rx);
16369     regexp_internal *reti;
16370     int len;
16371     RXi_GET_DECL(r,ri);
16372
16373     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16374
16375     len = ProgLen(ri);
16376
16377     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16378           char, regexp_internal);
16379     Copy(ri->program, reti->program, len+1, regnode);
16380
16381     reti->num_code_blocks = ri->num_code_blocks;
16382     if (ri->code_blocks) {
16383         int n;
16384         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16385                 struct reg_code_block);
16386         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16387                 struct reg_code_block);
16388         for (n = 0; n < ri->num_code_blocks; n++)
16389              reti->code_blocks[n].src_regex = (REGEXP*)
16390                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16391     }
16392     else
16393         reti->code_blocks = NULL;
16394
16395     reti->regstclass = NULL;
16396
16397     if (ri->data) {
16398         struct reg_data *d;
16399         const int count = ri->data->count;
16400         int i;
16401
16402         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16403                 char, struct reg_data);
16404         Newx(d->what, count, U8);
16405
16406         d->count = count;
16407         for (i = 0; i < count; i++) {
16408             d->what[i] = ri->data->what[i];
16409             switch (d->what[i]) {
16410                 /* see also regcomp.h and regfree_internal() */
16411             case 'a': /* actually an AV, but the dup function is identical.  */
16412             case 'r':
16413             case 's':
16414             case 'S':
16415             case 'u': /* actually an HV, but the dup function is identical.  */
16416                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16417                 break;
16418             case 'f':
16419                 /* This is cheating. */
16420                 Newx(d->data[i], 1, regnode_ssc);
16421                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16422                 reti->regstclass = (regnode*)d->data[i];
16423                 break;
16424             case 'T':
16425                 /* Trie stclasses are readonly and can thus be shared
16426                  * without duplication. We free the stclass in pregfree
16427                  * when the corresponding reg_ac_data struct is freed.
16428                  */
16429                 reti->regstclass= ri->regstclass;
16430                 /* FALLTHROUGH */
16431             case 't':
16432                 OP_REFCNT_LOCK;
16433                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16434                 OP_REFCNT_UNLOCK;
16435                 /* FALLTHROUGH */
16436             case 'l':
16437             case 'L':
16438                 d->data[i] = ri->data->data[i];
16439                 break;
16440             default:
16441                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16442                                                            ri->data->what[i]);
16443             }
16444         }
16445
16446         reti->data = d;
16447     }
16448     else
16449         reti->data = NULL;
16450
16451     reti->name_list_idx = ri->name_list_idx;
16452
16453 #ifdef RE_TRACK_PATTERN_OFFSETS
16454     if (ri->u.offsets) {
16455         Newx(reti->u.offsets, 2*len+1, U32);
16456         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16457     }
16458 #else
16459     SetProgLen(reti,len);
16460 #endif
16461
16462     return (void*)reti;
16463 }
16464
16465 #endif    /* USE_ITHREADS */
16466
16467 #ifndef PERL_IN_XSUB_RE
16468
16469 /*
16470  - regnext - dig the "next" pointer out of a node
16471  */
16472 regnode *
16473 Perl_regnext(pTHX_ regnode *p)
16474 {
16475     dVAR;
16476     I32 offset;
16477
16478     if (!p)
16479         return(NULL);
16480
16481     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16482         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16483                                                 (int)OP(p), (int)REGNODE_MAX);
16484     }
16485
16486     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16487     if (offset == 0)
16488         return(NULL);
16489
16490     return(p+offset);
16491 }
16492 #endif
16493
16494 STATIC void
16495 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16496 {
16497     va_list args;
16498     STRLEN l1 = strlen(pat1);
16499     STRLEN l2 = strlen(pat2);
16500     char buf[512];
16501     SV *msv;
16502     const char *message;
16503
16504     PERL_ARGS_ASSERT_RE_CROAK2;
16505
16506     if (l1 > 510)
16507         l1 = 510;
16508     if (l1 + l2 > 510)
16509         l2 = 510 - l1;
16510     Copy(pat1, buf, l1 , char);
16511     Copy(pat2, buf + l1, l2 , char);
16512     buf[l1 + l2] = '\n';
16513     buf[l1 + l2 + 1] = '\0';
16514     va_start(args, pat2);
16515     msv = vmess(buf, &args);
16516     va_end(args);
16517     message = SvPV_const(msv,l1);
16518     if (l1 > 512)
16519         l1 = 512;
16520     Copy(message, buf, l1 , char);
16521     /* l1-1 to avoid \n */
16522     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16523 }
16524
16525 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16526
16527 #ifndef PERL_IN_XSUB_RE
16528 void
16529 Perl_save_re_context(pTHX)
16530 {
16531     dVAR;
16532
16533     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16534     if (PL_curpm) {
16535         const REGEXP * const rx = PM_GETRE(PL_curpm);
16536         if (rx) {
16537             U32 i;
16538             for (i = 1; i <= RX_NPARENS(rx); i++) {
16539                 char digits[TYPE_CHARS(long)];
16540                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16541                                                "%lu", (long)i);
16542                 GV *const *const gvp
16543                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16544
16545                 if (gvp) {
16546                     GV * const gv = *gvp;
16547                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16548                         save_scalar(gv);
16549                 }
16550             }
16551         }
16552     }
16553 }
16554 #endif
16555
16556 #ifdef DEBUGGING
16557
16558 STATIC void
16559 S_put_byte(pTHX_ SV *sv, int c)
16560 {
16561     PERL_ARGS_ASSERT_PUT_BYTE;
16562
16563     if (!isPRINT(c)) {
16564         switch (c) {
16565             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16566             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16567             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16568             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16569             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16570
16571             default:
16572                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16573                 break;
16574         }
16575     }
16576     else {
16577         const char string = c;
16578         if (c == '-' || c == ']' || c == '\\' || c == '^')
16579             sv_catpvs(sv, "\\");
16580         sv_catpvn(sv, &string, 1);
16581     }
16582 }
16583
16584 STATIC void
16585 S_put_range(pTHX_ SV *sv, UV start, UV end)
16586 {
16587
16588     /* Appends to 'sv' a displayable version of the range of code points from
16589      * 'start' to 'end' */
16590
16591     assert(start <= end);
16592
16593     PERL_ARGS_ASSERT_PUT_RANGE;
16594
16595     if (end - start < 3) {  /* Individual chars in short ranges */
16596         for (; start <= end; start++)
16597             put_byte(sv, start);
16598     }
16599     else if (   end > 255
16600              || ! isALPHANUMERIC(start)
16601              || ! isALPHANUMERIC(end)
16602              || isDIGIT(start) != isDIGIT(end)
16603              || isUPPER(start) != isUPPER(end)
16604              || isLOWER(start) != isLOWER(end)
16605
16606                 /* This final test should get optimized out except on EBCDIC
16607                  * platforms, where it causes ranges that cross discontinuities
16608                  * like i/j to be shown as hex instead of the misleading,
16609                  * e.g. H-K (since that range includes more than H, I, J, K).
16610                  * */
16611              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16612     {
16613         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16614                        start,
16615                        (end < 256) ? end : 255);
16616     }
16617     else { /* Here, the ends of the range are both digits, or both uppercase,
16618               or both lowercase; and there's no discontinuity in the range
16619               (which could happen on EBCDIC platforms) */
16620         put_byte(sv, start);
16621         sv_catpvs(sv, "-");
16622         put_byte(sv, end);
16623     }
16624 }
16625
16626 STATIC bool
16627 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16628 {
16629     /* Appends to 'sv' a displayable version of the innards of the bracketed
16630      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16631      * output anything */
16632
16633     int i;
16634     bool has_output_anything = FALSE;
16635
16636     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16637
16638     for (i = 0; i < 256; i++) {
16639         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16640
16641             /* The character at index i should be output.  Find the next
16642              * character that should NOT be output */
16643             int j;
16644             for (j = i + 1; j <= 256; j++) {
16645                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16646                     break;
16647                 }
16648             }
16649
16650             /* Everything between them is a single range that should be output
16651              * */
16652             put_range(sv, i, j - 1);
16653             has_output_anything = TRUE;
16654             i = j;
16655         }
16656     }
16657
16658     return has_output_anything;
16659 }
16660
16661 #define CLEAR_OPTSTART \
16662     if (optstart) STMT_START {                                               \
16663         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16664                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16665         optstart=NULL;                                                       \
16666     } STMT_END
16667
16668 #define DUMPUNTIL(b,e)                                                       \
16669                     CLEAR_OPTSTART;                                          \
16670                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16671
16672 STATIC const regnode *
16673 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16674             const regnode *last, const regnode *plast,
16675             SV* sv, I32 indent, U32 depth)
16676 {
16677     dVAR;
16678     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16679     const regnode *next;
16680     const regnode *optstart= NULL;
16681
16682     RXi_GET_DECL(r,ri);
16683     GET_RE_DEBUG_FLAGS_DECL;
16684
16685     PERL_ARGS_ASSERT_DUMPUNTIL;
16686
16687 #ifdef DEBUG_DUMPUNTIL
16688     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16689         last ? last-start : 0,plast ? plast-start : 0);
16690 #endif
16691
16692     if (plast && plast < last)
16693         last= plast;
16694
16695     while (PL_regkind[op] != END && (!last || node < last)) {
16696         assert(node);
16697         /* While that wasn't END last time... */
16698         NODE_ALIGN(node);
16699         op = OP(node);
16700         if (op == CLOSE || op == WHILEM)
16701             indent--;
16702         next = regnext((regnode *)node);
16703
16704         /* Where, what. */
16705         if (OP(node) == OPTIMIZED) {
16706             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16707                 optstart = node;
16708             else
16709                 goto after_print;
16710         } else
16711             CLEAR_OPTSTART;
16712
16713         regprop(r, sv, node, NULL);
16714         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16715                       (int)(2*indent + 1), "", SvPVX_const(sv));
16716
16717         if (OP(node) != OPTIMIZED) {
16718             if (next == NULL)           /* Next ptr. */
16719                 PerlIO_printf(Perl_debug_log, " (0)");
16720             else if (PL_regkind[(U8)op] == BRANCH
16721                      && PL_regkind[OP(next)] != BRANCH )
16722                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16723             else
16724                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16725             (void)PerlIO_putc(Perl_debug_log, '\n');
16726         }
16727
16728       after_print:
16729         if (PL_regkind[(U8)op] == BRANCHJ) {
16730             assert(next);
16731             {
16732                 const regnode *nnode = (OP(next) == LONGJMP
16733                                        ? regnext((regnode *)next)
16734                                        : next);
16735                 if (last && nnode > last)
16736                     nnode = last;
16737                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16738             }
16739         }
16740         else if (PL_regkind[(U8)op] == BRANCH) {
16741             assert(next);
16742             DUMPUNTIL(NEXTOPER(node), next);
16743         }
16744         else if ( PL_regkind[(U8)op]  == TRIE ) {
16745             const regnode *this_trie = node;
16746             const char op = OP(node);
16747             const U32 n = ARG(node);
16748             const reg_ac_data * const ac = op>=AHOCORASICK ?
16749                (reg_ac_data *)ri->data->data[n] :
16750                NULL;
16751             const reg_trie_data * const trie =
16752                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16753 #ifdef DEBUGGING
16754             AV *const trie_words
16755                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16756 #endif
16757             const regnode *nextbranch= NULL;
16758             I32 word_idx;
16759             sv_setpvs(sv, "");
16760             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16761                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16762
16763                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16764                    (int)(2*(indent+3)), "",
16765                     elem_ptr
16766                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16767                                 SvCUR(*elem_ptr), 60,
16768                                 PL_colors[0], PL_colors[1],
16769                                 (SvUTF8(*elem_ptr)
16770                                  ? PERL_PV_ESCAPE_UNI
16771                                  : 0)
16772                                 | PERL_PV_PRETTY_ELLIPSES
16773                                 | PERL_PV_PRETTY_LTGT
16774                             )
16775                     : "???"
16776                 );
16777                 if (trie->jump) {
16778                     U16 dist= trie->jump[word_idx+1];
16779                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16780                                (UV)((dist ? this_trie + dist : next) - start));
16781                     if (dist) {
16782                         if (!nextbranch)
16783                             nextbranch= this_trie + trie->jump[0];
16784                         DUMPUNTIL(this_trie + dist, nextbranch);
16785                     }
16786                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16787                         nextbranch= regnext((regnode *)nextbranch);
16788                 } else {
16789                     PerlIO_printf(Perl_debug_log, "\n");
16790                 }
16791             }
16792             if (last && next > last)
16793                 node= last;
16794             else
16795                 node= next;
16796         }
16797         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16798             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16799                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16800         }
16801         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16802             assert(next);
16803             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16804         }
16805         else if ( op == PLUS || op == STAR) {
16806             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16807         }
16808         else if (PL_regkind[(U8)op] == ANYOF) {
16809             /* arglen 1 + class block */
16810             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16811                           ? ANYOF_POSIXL_SKIP
16812                           : ANYOF_SKIP);
16813             node = NEXTOPER(node);
16814         }
16815         else if (PL_regkind[(U8)op] == EXACT) {
16816             /* Literal string, where present. */
16817             node += NODE_SZ_STR(node) - 1;
16818             node = NEXTOPER(node);
16819         }
16820         else {
16821             node = NEXTOPER(node);
16822             node += regarglen[(U8)op];
16823         }
16824         if (op == CURLYX || op == OPEN)
16825             indent++;
16826     }
16827     CLEAR_OPTSTART;
16828 #ifdef DEBUG_DUMPUNTIL
16829     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16830 #endif
16831     return node;
16832 }
16833
16834 #endif  /* DEBUGGING */
16835
16836 /*
16837  * Local variables:
16838  * c-indentation-style: bsd
16839  * c-basic-offset: 4
16840  * indent-tabs-mode: nil
16841  * End:
16842  *
16843  * ex: set ts=8 sts=4 sw=4 et:
16844  */