regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 130                                            implies compiling, so don't emit */
 131     regnode     emit_dummy;             /* placeholder for emit to point to */
 132     I32         naughty;                /* How bad is this pattern? */
 133     I32         sawback;                /* Did we see \1, ...? */
 134     U32         seen;
 135     I32         size;                   /* Code size. */
 136     I32         npar;                   /* Capture buffer count, (OPEN). */
 137     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 138     I32         nestroot;               /* root parens we are in - used by accept */
 139     I32         extralen;
 140     I32         seen_zerolen;
 141     regnode     **open_parens;          /* pointers to open parens */
 142     regnode     **close_parens;         /* pointers to close parens */
 143     regnode     *opend;                 /* END node in program */
 144     I32         utf8;           /* whether the pattern is utf8 or not */
 145     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 146                                 /* XXX use this for future optimisation of case
 147                                  * where pattern must be upgraded to utf8. */
 148     I32         uni_semantics;  /* If a d charset modifier should use unicode
 149                                    rules, even if the pattern is not in
 150                                    utf8 */
 151     HV          *paren_names;           /* Paren names */
 152
 153     regnode     **recurse;              /* Recurse regops */
 154     I32         recurse_count;          /* Number of recurse regops */
 155     I32         in_lookbehind;
 156     I32         contains_locale;
 157     I32         override_recoding;
 158     I32         in_multi_char_class;
 159     struct reg_code_block *code_blocks; /* positions of literal (?{})
 160                                             within pattern */
 161     int         num_code_blocks;        /* size of code_blocks[] */
 162     int         code_index;             /* next code_blocks[] slot */
 163 #if ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 } RExC_state_t;
 177
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 190 #endif
 191 #define RExC_emit       (pRExC_state->emit)
 192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 193 #define RExC_emit_start (pRExC_state->emit_start)
 194 #define RExC_emit_bound (pRExC_state->emit_bound)
 195 #define RExC_naughty    (pRExC_state->naughty)
 196 #define RExC_sawback    (pRExC_state->sawback)
 197 #define RExC_seen       (pRExC_state->seen)
 198 #define RExC_size       (pRExC_state->size)
 199 #define RExC_npar       (pRExC_state->npar)
 200 #define RExC_nestroot   (pRExC_state->nestroot)
 201 #define RExC_extralen   (pRExC_state->extralen)
 202 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 203 #define RExC_utf8       (pRExC_state->utf8)
 204 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 205 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 206 #define RExC_open_parens        (pRExC_state->open_parens)
 207 #define RExC_close_parens       (pRExC_state->close_parens)
 208 #define RExC_opend      (pRExC_state->opend)
 209 #define RExC_paren_names        (pRExC_state->paren_names)
 210 #define RExC_recurse    (pRExC_state->recurse)
 211 #define RExC_recurse_count      (pRExC_state->recurse_count)
 212 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 213 #define RExC_contains_locale    (pRExC_state->contains_locale)
 214 #define RExC_override_recoding (pRExC_state->override_recoding)
 215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 216
 217
 218 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 219 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 220         ((*s) == '{' && regcurly(s, FALSE)))
 221
 222 #ifdef SPSTART
 223 #undef SPSTART          /* dratted cpp namespace... */
 224 #endif
 225 /*
 226  * Flags to be passed up and down.
 227  */
 228 #define WORST           0       /* Worst case. */
 229 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 230
 231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 232  * character.  (There needs to be a case: in the switch statement in regexec.c
 233  * for any node marked SIMPLE.)  Note that this is not the same thing as
 234  * REGNODE_SIMPLE */
 235 #define SIMPLE          0x02
 236 #define SPSTART         0x04    /* Starts with * or + */
 237 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 238 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 239 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 #define REQUIRE_UTF8    STMT_START {                                       \
 259                                      if (!UTF) {                           \
 260                                          *flagp = RESTART_UTF8;            \
 261                                          return NULL;                      \
 262                                      }                                     \
 263                         } STMT_END
 264
 265 /* This converts the named class defined in regcomp.h to its equivalent class
 266  * number defined in handy.h. */
 267 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 268 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 269
 270 /* About scan_data_t.
 271
 272   During optimisation we recurse through the regexp program performing
 273   various inplace (keyhole style) optimisations. In addition study_chunk
 274   and scan_commit populate this data structure with information about
 275   what strings MUST appear in the pattern. We look for the longest
 276   string that must appear at a fixed location, and we look for the
 277   longest string that may appear at a floating location. So for instance
 278   in the pattern:
 279
 280     /FOO[xX]A.*B[xX]BAR/
 281
 282   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 283   strings (because they follow a .* construct). study_chunk will identify
 284   both FOO and BAR as being the longest fixed and floating strings respectively.
 285
 286   The strings can be composites, for instance
 287
 288      /(f)(o)(o)/
 289
 290   will result in a composite fixed substring 'foo'.
 291
 292   For each string some basic information is maintained:
 293
 294   - offset or min_offset
 295     This is the position the string must appear at, or not before.
 296     It also implicitly (when combined with minlenp) tells us how many
 297     characters must match before the string we are searching for.
 298     Likewise when combined with minlenp and the length of the string it
 299     tells us how many characters must appear after the string we have
 300     found.
 301
 302   - max_offset
 303     Only used for floating strings. This is the rightmost point that
 304     the string can appear at. If set to I32 max it indicates that the
 305     string can occur infinitely far to the right.
 306
 307   - minlenp
 308     A pointer to the minimum number of characters of the pattern that the
 309     string was found inside. This is important as in the case of positive
 310     lookahead or positive lookbehind we can have multiple patterns
 311     involved. Consider
 312
 313     /(?=FOO).*F/
 314
 315     The minimum length of the pattern overall is 3, the minimum length
 316     of the lookahead part is 3, but the minimum length of the part that
 317     will actually match is 1. So 'FOO's minimum length is 3, but the
 318     minimum length for the F is 1. This is important as the minimum length
 319     is used to determine offsets in front of and behind the string being
 320     looked for.  Since strings can be composites this is the length of the
 321     pattern at the time it was committed with a scan_commit. Note that
 322     the length is calculated by study_chunk, so that the minimum lengths
 323     are not known until the full pattern has been compiled, thus the
 324     pointer to the value.
 325
 326   - lookbehind
 327
 328     In the case of lookbehind the string being searched for can be
 329     offset past the start point of the final matching string.
 330     If this value was just blithely removed from the min_offset it would
 331     invalidate some of the calculations for how many chars must match
 332     before or after (as they are derived from min_offset and minlen and
 333     the length of the string being searched for).
 334     When the final pattern is compiled and the data is moved from the
 335     scan_data_t structure into the regexp structure the information
 336     about lookbehind is factored in, with the information that would
 337     have been lost precalculated in the end_shift field for the
 338     associated string.
 339
 340   The fields pos_min and pos_delta are used to store the minimum offset
 341   and the delta to the maximum offset at the current point in the pattern.
 342
 343 */
 344
 345 typedef struct scan_data_t {
 346     /*I32 len_min;      unused */
 347     /*I32 len_delta;    unused */
 348     I32 pos_min;
 349     I32 pos_delta;
 350     SV *last_found;
 351     I32 last_end;           /* min value, <0 unless valid. */
 352     I32 last_start_min;
 353     I32 last_start_max;
 354     SV **longest;           /* Either &l_fixed, or &l_float. */
 355     SV *longest_fixed;      /* longest fixed string found in pattern */
 356     I32 offset_fixed;       /* offset where it starts */
 357     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 359     SV *longest_float;      /* longest floating string found in pattern */
 360     I32 offset_float_min;   /* earliest point in string it can appear */
 361     I32 offset_float_max;   /* latest point in string it can appear */
 362     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 363     I32 lookbehind_float;   /* is the position of the string modified by LB */
 364     I32 flags;
 365     I32 whilem_c;
 366     I32 *last_closep;
 367     struct regnode_charclass_class *start_class;
 368 } scan_data_t;
 369
 370 /* The below is perhaps overboard, but this allows us to save a test at the
 371  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 372  * and 'a' differ by a single bit; the same with the upper and lower case of
 373  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 374  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 375  * then inverts it to form a mask, with just a single 0, in the bit position
 376  * where the upper- and lowercase differ.  XXX There are about 40 other
 377  * instances in the Perl core where this micro-optimization could be used.
 378  * Should decide if maintenance cost is worse, before changing those
 379  *
 380  * Returns a boolean as to whether or not 'v' is either a lowercase or
 381  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 382  * compile-time constant, the generated code is better than some optimizing
 383  * compilers figure out, amounting to a mask and test.  The results are
 384  * meaningless if 'c' is not one of [A-Za-z] */
 385 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 386                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 387
 388 /*
 389  * Forward declarations for pregcomp()'s friends.
 390  */
 391
 392 static const scan_data_t zero_scan_data =
 393   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 394
 395 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 396 #define SF_BEFORE_SEOL          0x0001
 397 #define SF_BEFORE_MEOL          0x0002
 398 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 399 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 400
 401 #ifdef NO_UNARY_PLUS
 402 #  define SF_FIX_SHIFT_EOL      (0+2)
 403 #  define SF_FL_SHIFT_EOL               (0+4)
 404 #else
 405 #  define SF_FIX_SHIFT_EOL      (+2)
 406 #  define SF_FL_SHIFT_EOL               (+4)
 407 #endif
 408
 409 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 410 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 411
 412 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 413 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 414 #define SF_IS_INF               0x0040
 415 #define SF_HAS_PAR              0x0080
 416 #define SF_IN_PAR               0x0100
 417 #define SF_HAS_EVAL             0x0200
 418 #define SCF_DO_SUBSTR           0x0400
 419 #define SCF_DO_STCLASS_AND      0x0800
 420 #define SCF_DO_STCLASS_OR       0x1000
 421 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 422 #define SCF_WHILEM_VISITED_POS  0x2000
 423
 424 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 425 #define SCF_SEEN_ACCEPT         0x8000
 426 #define SCF_TRIE_DOING_RESTUDY 0x10000
 427
 428 #define UTF cBOOL(RExC_utf8)
 429
 430 /* The enums for all these are ordered so things work out correctly */
 431 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 432 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 433 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 434 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 435 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 436 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 437 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 438
 439 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 440
 441 #define OOB_NAMEDCLASS          -1
 442
 443 /* There is no code point that is out-of-bounds, so this is problematic.  But
 444  * its only current use is to initialize a variable that is always set before
 445  * looked at. */
 446 #define OOB_UNICODE             0xDEADBEEF
 447
 448 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 449 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 450
 451
 452 /* length of regex to show in messages that don't mark a position within */
 453 #define RegexLengthToShowInErrorMessages 127
 454
 455 /*
 456  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 457  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 458  * op/pragma/warn/regcomp.
 459  */
 460 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 461 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 462
 463 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 464
 465 /*
 466  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 467  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 468  * "...".
 469  */
 470 #define _FAIL(code) STMT_START {                                        \
 471     const char *ellipses = "";                                          \
 472     IV len = RExC_end - RExC_precomp;                                   \
 473                                                                         \
 474     if (!SIZE_ONLY)                                                     \
 475         SAVEFREESV(RExC_rx_sv);                                         \
 476     if (len > RegexLengthToShowInErrorMessages) {                       \
 477         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 478         len = RegexLengthToShowInErrorMessages - 10;                    \
 479         ellipses = "...";                                               \
 480     }                                                                   \
 481     code;                                                               \
 482 } STMT_END
 483
 484 #define FAIL(msg) _FAIL(                            \
 485     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 486             msg, (int)len, RExC_precomp, ellipses))
 487
 488 #define FAIL2(msg,arg) _FAIL(                       \
 489     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 490             arg, (int)len, RExC_precomp, ellipses))
 491
 492 /*
 493  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 494  */
 495 #define Simple_vFAIL(m) STMT_START {                                    \
 496     const IV offset = RExC_parse - RExC_precomp;                        \
 497     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 498             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 499 } STMT_END
 500
 501 /*
 502  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 503  */
 504 #define vFAIL(m) STMT_START {                           \
 505     if (!SIZE_ONLY)                                     \
 506         SAVEFREESV(RExC_rx_sv);                         \
 507     Simple_vFAIL(m);                                    \
 508 } STMT_END
 509
 510 /*
 511  * Like Simple_vFAIL(), but accepts two arguments.
 512  */
 513 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 516             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 521  */
 522 #define vFAIL2(m,a1) STMT_START {                       \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL2(m, a1);                               \
 526 } STMT_END
 527
 528
 529 /*
 530  * Like Simple_vFAIL(), but accepts three arguments.
 531  */
 532 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 533     const IV offset = RExC_parse - RExC_precomp;                \
 534     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 535             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 536 } STMT_END
 537
 538 /*
 539  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 540  */
 541 #define vFAIL3(m,a1,a2) STMT_START {                    \
 542     if (!SIZE_ONLY)                                     \
 543         SAVEFREESV(RExC_rx_sv);                         \
 544     Simple_vFAIL3(m, a1, a2);                           \
 545 } STMT_END
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts four arguments.
 549  */
 550 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 553             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 554 } STMT_END
 555
 556 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 557     if (!SIZE_ONLY)                                     \
 558         SAVEFREESV(RExC_rx_sv);                         \
 559     Simple_vFAIL4(m, a1, a2, a3);                       \
 560 } STMT_END
 561
 562 /* m is not necessarily a "literal string", in this macro */
 563 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 564     const IV offset = loc - RExC_precomp;                               \
 565     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 566             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 567 } STMT_END
 568
 569 #define ckWARNreg(loc,m) STMT_START {                                   \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 572             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 573 } STMT_END
 574
 575 #define vWARN_dep(loc, m) STMT_START {                                  \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 578             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 579 } STMT_END
 580
 581 #define ckWARNdep(loc,m) STMT_START {                                   \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 584             m REPORT_LOCATION,                                          \
 585             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 586 } STMT_END
 587
 588 #define ckWARNregdep(loc,m) STMT_START {                                \
 589     const IV offset = loc - RExC_precomp;                               \
 590     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 591             m REPORT_LOCATION,                                          \
 592             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 593 } STMT_END
 594
 595 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 596     const IV offset = loc - RExC_precomp;                               \
 597     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 598             m REPORT_LOCATION,                                          \
 599             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 600 } STMT_END
 601
 602 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 603     const IV offset = loc - RExC_precomp;                               \
 604     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 605             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 606 } STMT_END
 607
 608 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 611             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 612 } STMT_END
 613
 614 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 615     const IV offset = loc - RExC_precomp;                               \
 616     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 617             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 618 } STMT_END
 619
 620 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 621     const IV offset = loc - RExC_precomp;                               \
 622     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 623             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 624 } STMT_END
 625
 626 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 627     const IV offset = loc - RExC_precomp;                               \
 628     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 629             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 630 } STMT_END
 631
 632 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 635             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 636 } STMT_END
 637
 638
 639 /* Allow for side effects in s */
 640 #define REGC(c,s) STMT_START {                  \
 641     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 642 } STMT_END
 643
 644 /* Macros for recording node offsets.   20001227 mjd@plover.com
 645  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 646  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 647  * Element 0 holds the number n.
 648  * Position is 1 indexed.
 649  */
 650 #ifndef RE_TRACK_PATTERN_OFFSETS
 651 #define Set_Node_Offset_To_R(node,byte)
 652 #define Set_Node_Offset(node,byte)
 653 #define Set_Cur_Node_Offset
 654 #define Set_Node_Length_To_R(node,len)
 655 #define Set_Node_Length(node,len)
 656 #define Set_Node_Cur_Length(node,start)
 657 #define Node_Offset(n)
 658 #define Node_Length(n)
 659 #define Set_Node_Offset_Length(node,offset,len)
 660 #define ProgLen(ri) ri->u.proglen
 661 #define SetProgLen(ri,x) ri->u.proglen = x
 662 #else
 663 #define ProgLen(ri) ri->u.offsets[0]
 664 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 665 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 666     if (! SIZE_ONLY) {                                                  \
 667         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 668                     __LINE__, (int)(node), (int)(byte)));               \
 669         if((node) < 0) {                                                \
 670             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 671         } else {                                                        \
 672             RExC_offsets[2*(node)-1] = (byte);                          \
 673         }                                                               \
 674     }                                                                   \
 675 } STMT_END
 676
 677 #define Set_Node_Offset(node,byte) \
 678     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 679 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 680
 681 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 682     if (! SIZE_ONLY) {                                                  \
 683         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 684                 __LINE__, (int)(node), (int)(len)));                    \
 685         if((node) < 0) {                                                \
 686             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 687         } else {                                                        \
 688             RExC_offsets[2*(node)] = (len);                             \
 689         }                                                               \
 690     }                                                                   \
 691 } STMT_END
 692
 693 #define Set_Node_Length(node,len) \
 694     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 695 #define Set_Node_Cur_Length(node, start)                \
 696     Set_Node_Length(node, RExC_parse - start)
 697
 698 /* Get offsets and lengths */
 699 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 700 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 701
 702 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 703     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 704     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 705 } STMT_END
 706 #endif
 707
 708 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 709 #define EXPERIMENTAL_INPLACESCAN
 710 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 711
 712 #define DEBUG_STUDYDATA(str,data,depth)                              \
 713 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 714     PerlIO_printf(Perl_debug_log,                                    \
 715         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 716         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 717         (int)(depth)*2, "",                                          \
 718         (IV)((data)->pos_min),                                       \
 719         (IV)((data)->pos_delta),                                     \
 720         (UV)((data)->flags),                                         \
 721         (IV)((data)->whilem_c),                                      \
 722         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 723         is_inf ? "INF " : ""                                         \
 724     );                                                               \
 725     if ((data)->last_found)                                          \
 726         PerlIO_printf(Perl_debug_log,                                \
 727             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 728             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 729             SvPVX_const((data)->last_found),                         \
 730             (IV)((data)->last_end),                                  \
 731             (IV)((data)->last_start_min),                            \
 732             (IV)((data)->last_start_max),                            \
 733             ((data)->longest &&                                      \
 734              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 735             SvPVX_const((data)->longest_fixed),                      \
 736             (IV)((data)->offset_fixed),                              \
 737             ((data)->longest &&                                      \
 738              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 739             SvPVX_const((data)->longest_float),                      \
 740             (IV)((data)->offset_float_min),                          \
 741             (IV)((data)->offset_float_max)                           \
 742         );                                                           \
 743     PerlIO_printf(Perl_debug_log,"\n");                              \
 744 });
 745
 746 /* Mark that we cannot extend a found fixed substring at this point.
 747    Update the longest found anchored substring and the longest found
 748    floating substrings if needed. */
 749
 750 STATIC void
 751 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 752 {
 753     const STRLEN l = CHR_SVLEN(data->last_found);
 754     const STRLEN old_l = CHR_SVLEN(*data->longest);
 755     GET_RE_DEBUG_FLAGS_DECL;
 756
 757     PERL_ARGS_ASSERT_SCAN_COMMIT;
 758
 759     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 760         SvSetMagicSV(*data->longest, data->last_found);
 761         if (*data->longest == data->longest_fixed) {
 762             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 763             if (data->flags & SF_BEFORE_EOL)
 764                 data->flags
 765                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 766             else
 767                 data->flags &= ~SF_FIX_BEFORE_EOL;
 768             data->minlen_fixed=minlenp;
 769             data->lookbehind_fixed=0;
 770         }
 771         else { /* *data->longest == data->longest_float */
 772             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 773             data->offset_float_max = (l
 774                                       ? data->last_start_max
 775                                       : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 776             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 777                 data->offset_float_max = I32_MAX;
 778             if (data->flags & SF_BEFORE_EOL)
 779                 data->flags
 780                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 781             else
 782                 data->flags &= ~SF_FL_BEFORE_EOL;
 783             data->minlen_float=minlenp;
 784             data->lookbehind_float=0;
 785         }
 786     }
 787     SvCUR_set(data->last_found, 0);
 788     {
 789         SV * const sv = data->last_found;
 790         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 791             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 792             if (mg)
 793                 mg->mg_len = 0;
 794         }
 795     }
 796     data->last_end = -1;
 797     data->flags &= ~SF_BEFORE_EOL;
 798     DEBUG_STUDYDATA("commit: ",data,0);
 799 }
 800
 801 /* These macros set, clear and test whether the synthetic start class ('ssc',
 802  * given by the parameter) matches an empty string (EOS).  This uses the
 803  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 804  * stands alone, so there is never a next_off, so this field is otherwise
 805  * unused.  The EOS information is used only for compilation, but theoretically
 806  * it could be passed on to the execution code.  This could be used to store
 807  * more than one bit of information, but only this one is currently used. */
 808 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 809 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 810 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 811
 812 /* Can match anything (initialization) */
 813 STATIC void
 814 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 815 {
 816     PERL_ARGS_ASSERT_CL_ANYTHING;
 817
 818     ANYOF_BITMAP_SETALL(cl);
 819     cl->flags = ANYOF_UNICODE_ALL;
 820     SET_SSC_EOS(cl);
 821
 822     /* If any portion of the regex is to operate under locale rules,
 823      * initialization includes it.  The reason this isn't done for all regexes
 824      * is that the optimizer was written under the assumption that locale was
 825      * all-or-nothing.  Given the complexity and lack of documentation in the
 826      * optimizer, and that there are inadequate test cases for locale, so many
 827      * parts of it may not work properly, it is safest to avoid locale unless
 828      * necessary. */
 829     if (RExC_contains_locale) {
 830         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 831         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 832     }
 833     else {
 834         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 835     }
 836 }
 837
 838 /* Can match anything (initialization) */
 839 STATIC int
 840 S_cl_is_anything(const struct regnode_charclass_class *cl)
 841 {
 842     int value;
 843
 844     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 845
 846     for (value = 0; value < ANYOF_MAX; value += 2)
 847         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 848             return 1;
 849     if (!(cl->flags & ANYOF_UNICODE_ALL))
 850         return 0;
 851     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 852         return 0;
 853     return 1;
 854 }
 855
 856 /* Can match anything (initialization) */
 857 STATIC void
 858 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 859 {
 860     PERL_ARGS_ASSERT_CL_INIT;
 861
 862     Zero(cl, 1, struct regnode_charclass_class);
 863     cl->type = ANYOF;
 864     cl_anything(pRExC_state, cl);
 865     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 866 }
 867
 868 /* These two functions currently do the exact same thing */
 869 #define cl_init_zero            cl_init
 870
 871 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 872  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 873  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 874 STATIC void
 875 S_cl_and(struct regnode_charclass_class *cl,
 876         const struct regnode_charclass_class *and_with)
 877 {
 878     PERL_ARGS_ASSERT_CL_AND;
 879
 880     assert(PL_regkind[and_with->type] == ANYOF);
 881
 882     /* I (khw) am not sure all these restrictions are necessary XXX */
 883     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 884         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 885         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 886         && !(and_with->flags & ANYOF_LOC_FOLD)
 887         && !(cl->flags & ANYOF_LOC_FOLD)) {
 888         int i;
 889
 890         if (and_with->flags & ANYOF_INVERT)
 891             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 892                 cl->bitmap[i] &= ~and_with->bitmap[i];
 893         else
 894             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 895                 cl->bitmap[i] &= and_with->bitmap[i];
 896     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 897
 898     if (and_with->flags & ANYOF_INVERT) {
 899
 900         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 901          * aren't affected by the inversion.  Those that are affected are
 902          * handled individually below */
 903         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 904         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 905         cl->flags |= affected_flags;
 906
 907         /* We currently don't know how to deal with things that aren't in the
 908          * bitmap, but we know that the intersection is no greater than what
 909          * is already in cl, so let there be false positives that get sorted
 910          * out after the synthetic start class succeeds, and the node is
 911          * matched for real. */
 912
 913         /* The inversion of these two flags indicate that the resulting
 914          * intersection doesn't have them */
 915         if (and_with->flags & ANYOF_UNICODE_ALL) {
 916             cl->flags &= ~ANYOF_UNICODE_ALL;
 917         }
 918         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 919             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 920         }
 921     }
 922     else {   /* and'd node is not inverted */
 923         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 924
 925         if (! ANYOF_NONBITMAP(and_with)) {
 926
 927             /* Here 'and_with' doesn't match anything outside the bitmap
 928              * (except possibly ANYOF_UNICODE_ALL), which means the
 929              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 930              * which case we don't know what the intersection is, but it's no
 931              * greater than what cl already has, so can just leave it alone,
 932              * with possible false positives */
 933             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 934                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 935                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 936             }
 937         }
 938         else if (! ANYOF_NONBITMAP(cl)) {
 939
 940             /* Here, 'and_with' does match something outside the bitmap, and cl
 941              * doesn't have a list of things to match outside the bitmap.  If
 942              * cl can match all code points above 255, the intersection will
 943              * be those above-255 code points that 'and_with' matches.  If cl
 944              * can't match all Unicode code points, it means that it can't
 945              * match anything outside the bitmap (since the 'if' that got us
 946              * into this block tested for that), so we leave the bitmap empty.
 947              */
 948             if (cl->flags & ANYOF_UNICODE_ALL) {
 949                 ARG_SET(cl, ARG(and_with));
 950
 951                 /* and_with's ARG may match things that don't require UTF8.
 952                  * And now cl's will too, in spite of this being an 'and'.  See
 953                  * the comments below about the kludge */
 954                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 955             }
 956         }
 957         else {
 958             /* Here, both 'and_with' and cl match something outside the
 959              * bitmap.  Currently we do not do the intersection, so just match
 960              * whatever cl had at the beginning.  */
 961         }
 962
 963
 964         /* Take the intersection of the two sets of flags.  However, the
 965          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 966          * kludge around the fact that this flag is not treated like the others
 967          * which are initialized in cl_anything().  The way the optimizer works
 968          * is that the synthetic start class (SSC) is initialized to match
 969          * anything, and then the first time a real node is encountered, its
 970          * values are AND'd with the SSC's with the result being the values of
 971          * the real node.  However, there are paths through the optimizer where
 972          * the AND never gets called, so those initialized bits are set
 973          * inappropriately, which is not usually a big deal, as they just cause
 974          * false positives in the SSC, which will just mean a probably
 975          * imperceptible slow down in execution.  However this bit has a
 976          * higher false positive consequence in that it can cause utf8.pm,
 977          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 978          * bigger slowdown and also causes significant extra memory to be used.
 979          * In order to prevent this, the code now takes a different tack.  The
 980          * bit isn't set unless some part of the regular expression needs it,
 981          * but once set it won't get cleared.  This means that these extra
 982          * modules won't get loaded unless there was some path through the
 983          * pattern that would have required them anyway, and  so any false
 984          * positives that occur by not ANDing them out when they could be
 985          * aren't as severe as they would be if we treated this bit like all
 986          * the others */
 987         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 988                                       & ANYOF_NONBITMAP_NON_UTF8;
 989         cl->flags &= and_with->flags;
 990         cl->flags |= outside_bitmap_but_not_utf8;
 991     }
 992 }
 993
 994 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 995  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 996  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 997 STATIC void
 998 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 999 {
1000     PERL_ARGS_ASSERT_CL_OR;
1001
1002     if (or_with->flags & ANYOF_INVERT) {
1003
1004         /* Here, the or'd node is to be inverted.  This means we take the
1005          * complement of everything not in the bitmap, but currently we don't
1006          * know what that is, so give up and match anything */
1007         if (ANYOF_NONBITMAP(or_with)) {
1008             cl_anything(pRExC_state, cl);
1009         }
1010         /* We do not use
1011          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1012          *   <= (B1 | !B2) | (CL1 | !CL2)
1013          * which is wasteful if CL2 is small, but we ignore CL2:
1014          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1015          * XXXX Can we handle case-fold?  Unclear:
1016          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1017          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1018          */
1019         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1020              && !(or_with->flags & ANYOF_LOC_FOLD)
1021              && !(cl->flags & ANYOF_LOC_FOLD) ) {
1022             int i;
1023
1024             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1025                 cl->bitmap[i] |= ~or_with->bitmap[i];
1026         } /* XXXX: logic is complicated otherwise */
1027         else {
1028             cl_anything(pRExC_state, cl);
1029         }
1030
1031         /* And, we can just take the union of the flags that aren't affected
1032          * by the inversion */
1033         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1034
1035         /* For the remaining flags:
1036             ANYOF_UNICODE_ALL and inverted means to not match anything above
1037                     255, which means that the union with cl should just be
1038                     what cl has in it, so can ignore this flag
1039             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1040                     is 127-255 to match them, but then invert that, so the
1041                     union with cl should just be what cl has in it, so can
1042                     ignore this flag
1043          */
1044     } else {    /* 'or_with' is not inverted */
1045         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1046         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1047              && (!(or_with->flags & ANYOF_LOC_FOLD)
1048                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1049             int i;
1050
1051             /* OR char bitmap and class bitmap separately */
1052             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1053                 cl->bitmap[i] |= or_with->bitmap[i];
1054             if (or_with->flags & ANYOF_CLASS) {
1055                 ANYOF_CLASS_OR(or_with, cl);
1056             }
1057         }
1058         else { /* XXXX: logic is complicated, leave it along for a moment. */
1059             cl_anything(pRExC_state, cl);
1060         }
1061
1062         if (ANYOF_NONBITMAP(or_with)) {
1063
1064             /* Use the added node's outside-the-bit-map match if there isn't a
1065              * conflict.  If there is a conflict (both nodes match something
1066              * outside the bitmap, but what they match outside is not the same
1067              * pointer, and hence not easily compared until XXX we extend
1068              * inversion lists this far), give up and allow the start class to
1069              * match everything outside the bitmap.  If that stuff is all above
1070              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1071             if (! ANYOF_NONBITMAP(cl)) {
1072                 ARG_SET(cl, ARG(or_with));
1073             }
1074             else if (ARG(cl) != ARG(or_with)) {
1075
1076                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1077                     cl_anything(pRExC_state, cl);
1078                 }
1079                 else {
1080                     cl->flags |= ANYOF_UNICODE_ALL;
1081                 }
1082             }
1083         }
1084
1085         /* Take the union */
1086         cl->flags |= or_with->flags;
1087     }
1088 }
1089
1090 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1091 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1092 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1093 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1094
1095
1096 #ifdef DEBUGGING
1097 /*
1098    dump_trie(trie,widecharmap,revcharmap)
1099    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1100    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1101
1102    These routines dump out a trie in a somewhat readable format.
1103    The _interim_ variants are used for debugging the interim
1104    tables that are used to generate the final compressed
1105    representation which is what dump_trie expects.
1106
1107    Part of the reason for their existence is to provide a form
1108    of documentation as to how the different representations function.
1109
1110 */
1111
1112 /*
1113   Dumps the final compressed table form of the trie to Perl_debug_log.
1114   Used for debugging make_trie().
1115 */
1116
1117 STATIC void
1118 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1119             AV *revcharmap, U32 depth)
1120 {
1121     U32 state;
1122     SV *sv=sv_newmortal();
1123     int colwidth= widecharmap ? 6 : 4;
1124     U16 word;
1125     GET_RE_DEBUG_FLAGS_DECL;
1126
1127     PERL_ARGS_ASSERT_DUMP_TRIE;
1128
1129     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1130         (int)depth * 2 + 2,"",
1131         "Match","Base","Ofs" );
1132
1133     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1134         SV ** const tmp = av_fetch( revcharmap, state, 0);
1135         if ( tmp ) {
1136             PerlIO_printf( Perl_debug_log, "%*s",
1137                 colwidth,
1138                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1139                             PL_colors[0], PL_colors[1],
1140                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1141                             PERL_PV_ESCAPE_FIRSTCHAR
1142                 )
1143             );
1144         }
1145     }
1146     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1147         (int)depth * 2 + 2,"");
1148
1149     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1150         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1151     PerlIO_printf( Perl_debug_log, "\n");
1152
1153     for( state = 1 ; state < trie->statecount ; state++ ) {
1154         const U32 base = trie->states[ state ].trans.base;
1155
1156         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1157
1158         if ( trie->states[ state ].wordnum ) {
1159             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1160         } else {
1161             PerlIO_printf( Perl_debug_log, "%6s", "" );
1162         }
1163
1164         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1165
1166         if ( base ) {
1167             U32 ofs = 0;
1168
1169             while( ( base + ofs  < trie->uniquecharcount ) ||
1170                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1171                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1172                     ofs++;
1173
1174             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1175
1176             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1177                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1178                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1179                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1180                 {
1181                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1182                     colwidth,
1183                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1184                 } else {
1185                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1186                 }
1187             }
1188
1189             PerlIO_printf( Perl_debug_log, "]");
1190
1191         }
1192         PerlIO_printf( Perl_debug_log, "\n" );
1193     }
1194     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1195     for (word=1; word <= trie->wordcount; word++) {
1196         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1197             (int)word, (int)(trie->wordinfo[word].prev),
1198             (int)(trie->wordinfo[word].len));
1199     }
1200     PerlIO_printf(Perl_debug_log, "\n" );
1201 }
1202 /*
1203   Dumps a fully constructed but uncompressed trie in list form.
1204   List tries normally only are used for construction when the number of
1205   possible chars (trie->uniquecharcount) is very high.
1206   Used for debugging make_trie().
1207 */
1208 STATIC void
1209 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1210                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211                          U32 depth)
1212 {
1213     U32 state;
1214     SV *sv=sv_newmortal();
1215     int colwidth= widecharmap ? 6 : 4;
1216     GET_RE_DEBUG_FLAGS_DECL;
1217
1218     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1219
1220     /* print out the table precompression.  */
1221     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1222         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1223         "------:-----+-----------------\n" );
1224
1225     for( state=1 ; state < next_alloc ; state ++ ) {
1226         U16 charid;
1227
1228         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1229             (int)depth * 2 + 2,"", (UV)state  );
1230         if ( ! trie->states[ state ].wordnum ) {
1231             PerlIO_printf( Perl_debug_log, "%5s| ","");
1232         } else {
1233             PerlIO_printf( Perl_debug_log, "W%4x| ",
1234                 trie->states[ state ].wordnum
1235             );
1236         }
1237         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1238             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1239             if ( tmp ) {
1240                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1241                     colwidth,
1242                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1243                             PL_colors[0], PL_colors[1],
1244                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1245                             PERL_PV_ESCAPE_FIRSTCHAR
1246                     ) ,
1247                     TRIE_LIST_ITEM(state,charid).forid,
1248                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1249                 );
1250                 if (!(charid % 10))
1251                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1252                         (int)((depth * 2) + 14), "");
1253             }
1254         }
1255         PerlIO_printf( Perl_debug_log, "\n");
1256     }
1257 }
1258
1259 /*
1260   Dumps a fully constructed but uncompressed trie in table form.
1261   This is the normal DFA style state transition table, with a few
1262   twists to facilitate compression later.
1263   Used for debugging make_trie().
1264 */
1265 STATIC void
1266 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1267                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1268                           U32 depth)
1269 {
1270     U32 state;
1271     U16 charid;
1272     SV *sv=sv_newmortal();
1273     int colwidth= widecharmap ? 6 : 4;
1274     GET_RE_DEBUG_FLAGS_DECL;
1275
1276     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1277
1278     /*
1279        print out the table precompression so that we can do a visual check
1280        that they are identical.
1281      */
1282
1283     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1284
1285     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1286         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1287         if ( tmp ) {
1288             PerlIO_printf( Perl_debug_log, "%*s",
1289                 colwidth,
1290                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1291                             PL_colors[0], PL_colors[1],
1292                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1293                             PERL_PV_ESCAPE_FIRSTCHAR
1294                 )
1295             );
1296         }
1297     }
1298
1299     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1300
1301     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1302         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1303     }
1304
1305     PerlIO_printf( Perl_debug_log, "\n" );
1306
1307     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1308
1309         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1310             (int)depth * 2 + 2,"",
1311             (UV)TRIE_NODENUM( state ) );
1312
1313         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1314             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1315             if (v)
1316                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1317             else
1318                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1319         }
1320         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1321             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1322         } else {
1323             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1324             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1325         }
1326     }
1327 }
1328
1329 #endif
1330
1331
1332 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1333   startbranch: the first branch in the whole branch sequence
1334   first      : start branch of sequence of branch-exact nodes.
1335                May be the same as startbranch
1336   last       : Thing following the last branch.
1337                May be the same as tail.
1338   tail       : item following the branch sequence
1339   count      : words in the sequence
1340   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1341   depth      : indent depth
1342
1343 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1344
1345 A trie is an N'ary tree where the branches are determined by digital
1346 decomposition of the key. IE, at the root node you look up the 1st character and
1347 follow that branch repeat until you find the end of the branches. Nodes can be
1348 marked as "accepting" meaning they represent a complete word. Eg:
1349
1350   /he|she|his|hers/
1351
1352 would convert into the following structure. Numbers represent states, letters
1353 following numbers represent valid transitions on the letter from that state, if
1354 the number is in square brackets it represents an accepting state, otherwise it
1355 will be in parenthesis.
1356
1357       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1358       |    |
1359       |   (2)
1360       |    |
1361      (1)   +-i->(6)-+-s->[7]
1362       |
1363       +-s->(3)-+-h->(4)-+-e->[5]
1364
1365       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1366
1367 This shows that when matching against the string 'hers' we will begin at state 1
1368 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1369 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1370 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1371 single traverse. We store a mapping from accepting to state to which word was
1372 matched, and then when we have multiple possibilities we try to complete the
1373 rest of the regex in the order in which they occured in the alternation.
1374
1375 The only prior NFA like behaviour that would be changed by the TRIE support is
1376 the silent ignoring of duplicate alternations which are of the form:
1377
1378  / (DUPE|DUPE) X? (?{ ... }) Y /x
1379
1380 Thus EVAL blocks following a trie may be called a different number of times with
1381 and without the optimisation. With the optimisations dupes will be silently
1382 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1383 the following demonstrates:
1384
1385  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1386
1387 which prints out 'word' three times, but
1388
1389  'words'=~/(word|word|word)(?{ print $1 })S/
1390
1391 which doesnt print it out at all. This is due to other optimisations kicking in.
1392
1393 Example of what happens on a structural level:
1394
1395 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1396
1397    1: CURLYM[1] {1,32767}(18)
1398    5:   BRANCH(8)
1399    6:     EXACT <ac>(16)
1400    8:   BRANCH(11)
1401    9:     EXACT <ad>(16)
1402   11:   BRANCH(14)
1403   12:     EXACT <ab>(16)
1404   16:   SUCCEED(0)
1405   17:   NOTHING(18)
1406   18: END(0)
1407
1408 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1409 and should turn into:
1410
1411    1: CURLYM[1] {1,32767}(18)
1412    5:   TRIE(16)
1413         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1414           <ac>
1415           <ad>
1416           <ab>
1417   16:   SUCCEED(0)
1418   17:   NOTHING(18)
1419   18: END(0)
1420
1421 Cases where tail != last would be like /(?foo|bar)baz/:
1422
1423    1: BRANCH(4)
1424    2:   EXACT <foo>(8)
1425    4: BRANCH(7)
1426    5:   EXACT <bar>(8)
1427    7: TAIL(8)
1428    8: EXACT <baz>(10)
1429   10: END(0)
1430
1431 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1432 and would end up looking like:
1433
1434     1: TRIE(8)
1435       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1436         <foo>
1437         <bar>
1438    7: TAIL(8)
1439    8: EXACT <baz>(10)
1440   10: END(0)
1441
1442     d = uvuni_to_utf8_flags(d, uv, 0);
1443
1444 is the recommended Unicode-aware way of saying
1445
1446     *(d++) = uv;
1447 */
1448
1449 #define TRIE_STORE_REVCHAR(val)                                            \
1450     STMT_START {                                                           \
1451         if (UTF) {                                                         \
1452             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1453             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1454             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1455             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1456             SvPOK_on(zlopp);                                               \
1457             SvUTF8_on(zlopp);                                              \
1458             av_push(revcharmap, zlopp);                                    \
1459         } else {                                                           \
1460             char ooooff = (char)val;                                           \
1461             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1462         }                                                                  \
1463         } STMT_END
1464
1465 #define TRIE_READ_CHAR STMT_START {                                                     \
1466     wordlen++;                                                                          \
1467     if ( UTF ) {                                                                        \
1468         /* if it is UTF then it is either already folded, or does not need folding */   \
1469         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1470     }                                                                                   \
1471     else if (folder == PL_fold_latin1) {                                                \
1472         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1473         if ( foldlen > 0 ) {                                                            \
1474            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1475            foldlen -= len;                                                              \
1476            scan += len;                                                                 \
1477            len = 0;                                                                     \
1478         } else {                                                                        \
1479             len = 1;                                                                    \
1480             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);       \
1481             skiplen = UNISKIP(uvc);                                                     \
1482             foldlen -= skiplen;                                                         \
1483             scan = foldbuf + skiplen;                                                   \
1484         }                                                                               \
1485     } else {                                                                            \
1486         /* raw data, will be folded later if needed */                                  \
1487         uvc = (U32)*uc;                                                                 \
1488         len = 1;                                                                        \
1489     }                                                                                   \
1490 } STMT_END
1491
1492
1493
1494 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1495     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1496         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1497         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1498     }                                                           \
1499     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1500     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1501     TRIE_LIST_CUR( state )++;                                   \
1502 } STMT_END
1503
1504 #define TRIE_LIST_NEW(state) STMT_START {                       \
1505     Newxz( trie->states[ state ].trans.list,               \
1506         4, reg_trie_trans_le );                                 \
1507      TRIE_LIST_CUR( state ) = 1;                                \
1508      TRIE_LIST_LEN( state ) = 4;                                \
1509 } STMT_END
1510
1511 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1512     U16 dupe= trie->states[ state ].wordnum;                    \
1513     regnode * const noper_next = regnext( noper );              \
1514                                                                 \
1515     DEBUG_r({                                                   \
1516         /* store the word for dumping */                        \
1517         SV* tmp;                                                \
1518         if (OP(noper) != NOTHING)                               \
1519             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1520         else                                                    \
1521             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1522         av_push( trie_words, tmp );                             \
1523     });                                                         \
1524                                                                 \
1525     curword++;                                                  \
1526     trie->wordinfo[curword].prev   = 0;                         \
1527     trie->wordinfo[curword].len    = wordlen;                   \
1528     trie->wordinfo[curword].accept = state;                     \
1529                                                                 \
1530     if ( noper_next < tail ) {                                  \
1531         if (!trie->jump)                                        \
1532             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1533         trie->jump[curword] = (U16)(noper_next - convert);      \
1534         if (!jumper)                                            \
1535             jumper = noper_next;                                \
1536         if (!nextbranch)                                        \
1537             nextbranch= regnext(cur);                           \
1538     }                                                           \
1539                                                                 \
1540     if ( dupe ) {                                               \
1541         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1542         /* chain, so that when the bits of chain are later    */\
1543         /* linked together, the dups appear in the chain      */\
1544         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1545         trie->wordinfo[dupe].prev = curword;                    \
1546     } else {                                                    \
1547         /* we haven't inserted this word yet.                */ \
1548         trie->states[ state ].wordnum = curword;                \
1549     }                                                           \
1550 } STMT_END
1551
1552
1553 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1554      ( ( base + charid >=  ucharcount                                   \
1555          && base + charid < ubound                                      \
1556          && state == trie->trans[ base - ucharcount + charid ].check    \
1557          && trie->trans[ base - ucharcount + charid ].next )            \
1558            ? trie->trans[ base - ucharcount + charid ].next             \
1559            : ( state==1 ? special : 0 )                                 \
1560       )
1561
1562 #define MADE_TRIE       1
1563 #define MADE_JUMP_TRIE  2
1564 #define MADE_EXACT_TRIE 4
1565
1566 STATIC I32
1567 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1568 {
1569     dVAR;
1570     /* first pass, loop through and scan words */
1571     reg_trie_data *trie;
1572     HV *widecharmap = NULL;
1573     AV *revcharmap = newAV();
1574     regnode *cur;
1575     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1576     STRLEN len = 0;
1577     UV uvc = 0;
1578     U16 curword = 0;
1579     U32 next_alloc = 0;
1580     regnode *jumper = NULL;
1581     regnode *nextbranch = NULL;
1582     regnode *convert = NULL;
1583     U32 *prev_states; /* temp array mapping each state to previous one */
1584     /* we just use folder as a flag in utf8 */
1585     const U8 * folder = NULL;
1586
1587 #ifdef DEBUGGING
1588     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1589     AV *trie_words = NULL;
1590     /* along with revcharmap, this only used during construction but both are
1591      * useful during debugging so we store them in the struct when debugging.
1592      */
1593 #else
1594     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1595     STRLEN trie_charcount=0;
1596 #endif
1597     SV *re_trie_maxbuff;
1598     GET_RE_DEBUG_FLAGS_DECL;
1599
1600     PERL_ARGS_ASSERT_MAKE_TRIE;
1601 #ifndef DEBUGGING
1602     PERL_UNUSED_ARG(depth);
1603 #endif
1604
1605     switch (flags) {
1606         case EXACT: break;
1607         case EXACTFA:
1608         case EXACTFU_SS:
1609         case EXACTFU_TRICKYFOLD:
1610         case EXACTFU: folder = PL_fold_latin1; break;
1611         case EXACTF:  folder = PL_fold; break;
1612         case EXACTFL: folder = PL_fold_locale; break;
1613         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1614     }
1615
1616     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1617     trie->refcount = 1;
1618     trie->startstate = 1;
1619     trie->wordcount = word_count;
1620     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1622     if (flags == EXACT)
1623         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626
1627     DEBUG_r({
1628         trie_words = newAV();
1629     });
1630
1631     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632     if (!SvIOK(re_trie_maxbuff)) {
1633         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1634     }
1635     DEBUG_TRIE_COMPILE_r({
1636                 PerlIO_printf( Perl_debug_log,
1637                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638                   (int)depth * 2 + 2, "",
1639                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1641                   (int)depth);
1642     });
1643
1644    /* Find the node we are going to overwrite */
1645     if ( first == startbranch && OP( last ) != BRANCH ) {
1646         /* whole branch chain */
1647         convert = first;
1648     } else {
1649         /* branch sub-chain */
1650         convert = NEXTOPER( first );
1651     }
1652
1653     /*  -- First loop and Setup --
1654
1655        We first traverse the branches and scan each word to determine if it
1656        contains widechars, and how many unique chars there are, this is
1657        important as we have to build a table with at least as many columns as we
1658        have unique chars.
1659
1660        We use an array of integers to represent the character codes 0..255
1661        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662        native representation of the character value as the key and IV's for the
1663        coded index.
1664
1665        *TODO* If we keep track of how many times each character is used we can
1666        remap the columns so that the table compression later on is more
1667        efficient in terms of memory by ensuring the most common value is in the
1668        middle and the least common are on the outside.  IMO this would be better
1669        than a most to least common mapping as theres a decent chance the most
1670        common letter will share a node with the least common, meaning the node
1671        will not be compressible. With a middle is most common approach the worst
1672        case is when we have the least common nodes twice.
1673
1674      */
1675
1676     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677         regnode *noper = NEXTOPER( cur );
1678         const U8 *uc = (U8*)STRING( noper );
1679         const U8 *e  = uc + STR_LEN( noper );
1680         STRLEN foldlen = 0;
1681         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1682         STRLEN skiplen = 0;
1683         const U8 *scan = (U8*)NULL;
1684         U32 wordlen      = 0;         /* required init */
1685         STRLEN chars = 0;
1686         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1687
1688         if (OP(noper) == NOTHING) {
1689             regnode *noper_next= regnext(noper);
1690             if (noper_next != tail && OP(noper_next) == flags) {
1691                 noper = noper_next;
1692                 uc= (U8*)STRING(noper);
1693                 e= uc + STR_LEN(noper);
1694                 trie->minlen= STR_LEN(noper);
1695             } else {
1696                 trie->minlen= 0;
1697                 continue;
1698             }
1699         }
1700
1701         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1702             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1703                                           regardless of encoding */
1704             if (OP( noper ) == EXACTFU_SS) {
1705                 /* false positives are ok, so just set this */
1706                 TRIE_BITMAP_SET(trie,0xDF);
1707             }
1708         }
1709         for ( ; uc < e ; uc += len ) {
1710             TRIE_CHARCOUNT(trie)++;
1711             TRIE_READ_CHAR;
1712             chars++;
1713             if ( uvc < 256 ) {
1714                 if ( folder ) {
1715                     U8 folded= folder[ (U8) uvc ];
1716                     if ( !trie->charmap[ folded ] ) {
1717                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1718                         TRIE_STORE_REVCHAR( folded );
1719                     }
1720                 }
1721                 if ( !trie->charmap[ uvc ] ) {
1722                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1723                     TRIE_STORE_REVCHAR( uvc );
1724                 }
1725                 if ( set_bit ) {
1726                     /* store the codepoint in the bitmap, and its folded
1727                      * equivalent. */
1728                     TRIE_BITMAP_SET(trie, uvc);
1729
1730                     /* store the folded codepoint */
1731                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1732
1733                     if ( !UTF ) {
1734                         /* store first byte of utf8 representation of
1735                            variant codepoints */
1736                         if (! UNI_IS_INVARIANT(uvc)) {
1737                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1738                         }
1739                     }
1740                     set_bit = 0; /* We've done our bit :-) */
1741                 }
1742             } else {
1743                 SV** svpp;
1744                 if ( !widecharmap )
1745                     widecharmap = newHV();
1746
1747                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1748
1749                 if ( !svpp )
1750                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1751
1752                 if ( !SvTRUE( *svpp ) ) {
1753                     sv_setiv( *svpp, ++trie->uniquecharcount );
1754                     TRIE_STORE_REVCHAR(uvc);
1755                 }
1756             }
1757         }
1758         if( cur == first ) {
1759             trie->minlen = chars;
1760             trie->maxlen = chars;
1761         } else if (chars < trie->minlen) {
1762             trie->minlen = chars;
1763         } else if (chars > trie->maxlen) {
1764             trie->maxlen = chars;
1765         }
1766         if (OP( noper ) == EXACTFU_SS) {
1767             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1768             if (trie->minlen > 1)
1769                 trie->minlen= 1;
1770         }
1771         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1772             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1773              *                - We assume that any such sequence might match a 2 byte string */
1774             if (trie->minlen > 2 )
1775                 trie->minlen= 2;
1776         }
1777
1778     } /* end first pass */
1779     DEBUG_TRIE_COMPILE_r(
1780         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1781                 (int)depth * 2 + 2,"",
1782                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1783                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1784                 (int)trie->minlen, (int)trie->maxlen )
1785     );
1786
1787     /*
1788         We now know what we are dealing with in terms of unique chars and
1789         string sizes so we can calculate how much memory a naive
1790         representation using a flat table  will take. If it's over a reasonable
1791         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1792         conservative but potentially much slower representation using an array
1793         of lists.
1794
1795         At the end we convert both representations into the same compressed
1796         form that will be used in regexec.c for matching with. The latter
1797         is a form that cannot be used to construct with but has memory
1798         properties similar to the list form and access properties similar
1799         to the table form making it both suitable for fast searches and
1800         small enough that its feasable to store for the duration of a program.
1801
1802         See the comment in the code where the compressed table is produced
1803         inplace from the flat tabe representation for an explanation of how
1804         the compression works.
1805
1806     */
1807
1808
1809     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1810     prev_states[1] = 0;
1811
1812     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1813         /*
1814             Second Pass -- Array Of Lists Representation
1815
1816             Each state will be represented by a list of charid:state records
1817             (reg_trie_trans_le) the first such element holds the CUR and LEN
1818             points of the allocated array. (See defines above).
1819
1820             We build the initial structure using the lists, and then convert
1821             it into the compressed table form which allows faster lookups
1822             (but cant be modified once converted).
1823         */
1824
1825         STRLEN transcount = 1;
1826
1827         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1828             "%*sCompiling trie using list compiler\n",
1829             (int)depth * 2 + 2, ""));
1830
1831         trie->states = (reg_trie_state *)
1832             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1833                                   sizeof(reg_trie_state) );
1834         TRIE_LIST_NEW(1);
1835         next_alloc = 2;
1836
1837         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1838
1839             regnode *noper   = NEXTOPER( cur );
1840             U8 *uc           = (U8*)STRING( noper );
1841             const U8 *e      = uc + STR_LEN( noper );
1842             U32 state        = 1;         /* required init */
1843             U16 charid       = 0;         /* sanity init */
1844             U8 *scan         = (U8*)NULL; /* sanity init */
1845             STRLEN foldlen   = 0;         /* required init */
1846             U32 wordlen      = 0;         /* required init */
1847             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1848             STRLEN skiplen   = 0;
1849
1850             if (OP(noper) == NOTHING) {
1851                 regnode *noper_next= regnext(noper);
1852                 if (noper_next != tail && OP(noper_next) == flags) {
1853                     noper = noper_next;
1854                     uc= (U8*)STRING(noper);
1855                     e= uc + STR_LEN(noper);
1856                 }
1857             }
1858
1859             if (OP(noper) != NOTHING) {
1860                 for ( ; uc < e ; uc += len ) {
1861
1862                     TRIE_READ_CHAR;
1863
1864                     if ( uvc < 256 ) {
1865                         charid = trie->charmap[ uvc ];
1866                     } else {
1867                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1868                         if ( !svpp ) {
1869                             charid = 0;
1870                         } else {
1871                             charid=(U16)SvIV( *svpp );
1872                         }
1873                     }
1874                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1875                     if ( charid ) {
1876
1877                         U16 check;
1878                         U32 newstate = 0;
1879
1880                         charid--;
1881                         if ( !trie->states[ state ].trans.list ) {
1882                             TRIE_LIST_NEW( state );
1883                         }
1884                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1885                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1886                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1887                                 break;
1888                             }
1889                         }
1890                         if ( ! newstate ) {
1891                             newstate = next_alloc++;
1892                             prev_states[newstate] = state;
1893                             TRIE_LIST_PUSH( state, charid, newstate );
1894                             transcount++;
1895                         }
1896                         state = newstate;
1897                     } else {
1898                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1899                     }
1900                 }
1901             }
1902             TRIE_HANDLE_WORD(state);
1903
1904         } /* end second pass */
1905
1906         /* next alloc is the NEXT state to be allocated */
1907         trie->statecount = next_alloc;
1908         trie->states = (reg_trie_state *)
1909             PerlMemShared_realloc( trie->states,
1910                                    next_alloc
1911                                    * sizeof(reg_trie_state) );
1912
1913         /* and now dump it out before we compress it */
1914         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1915                                                          revcharmap, next_alloc,
1916                                                          depth+1)
1917         );
1918
1919         trie->trans = (reg_trie_trans *)
1920             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1921         {
1922             U32 state;
1923             U32 tp = 0;
1924             U32 zp = 0;
1925
1926
1927             for( state=1 ; state < next_alloc ; state ++ ) {
1928                 U32 base=0;
1929
1930                 /*
1931                 DEBUG_TRIE_COMPILE_MORE_r(
1932                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1933                 );
1934                 */
1935
1936                 if (trie->states[state].trans.list) {
1937                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1938                     U16 maxid=minid;
1939                     U16 idx;
1940
1941                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1942                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1943                         if ( forid < minid ) {
1944                             minid=forid;
1945                         } else if ( forid > maxid ) {
1946                             maxid=forid;
1947                         }
1948                     }
1949                     if ( transcount < tp + maxid - minid + 1) {
1950                         transcount *= 2;
1951                         trie->trans = (reg_trie_trans *)
1952                             PerlMemShared_realloc( trie->trans,
1953                                                      transcount
1954                                                      * sizeof(reg_trie_trans) );
1955                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1956                     }
1957                     base = trie->uniquecharcount + tp - minid;
1958                     if ( maxid == minid ) {
1959                         U32 set = 0;
1960                         for ( ; zp < tp ; zp++ ) {
1961                             if ( ! trie->trans[ zp ].next ) {
1962                                 base = trie->uniquecharcount + zp - minid;
1963                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1964                                 trie->trans[ zp ].check = state;
1965                                 set = 1;
1966                                 break;
1967                             }
1968                         }
1969                         if ( !set ) {
1970                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1971                             trie->trans[ tp ].check = state;
1972                             tp++;
1973                             zp = tp;
1974                         }
1975                     } else {
1976                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1977                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1978                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1979                             trie->trans[ tid ].check = state;
1980                         }
1981                         tp += ( maxid - minid + 1 );
1982                     }
1983                     Safefree(trie->states[ state ].trans.list);
1984                 }
1985                 /*
1986                 DEBUG_TRIE_COMPILE_MORE_r(
1987                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1988                 );
1989                 */
1990                 trie->states[ state ].trans.base=base;
1991             }
1992             trie->lasttrans = tp + 1;
1993         }
1994     } else {
1995         /*
1996            Second Pass -- Flat Table Representation.
1997
1998            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1999            We know that we will need Charcount+1 trans at most to store the data
2000            (one row per char at worst case) So we preallocate both structures
2001            assuming worst case.
2002
2003            We then construct the trie using only the .next slots of the entry
2004            structs.
2005
2006            We use the .check field of the first entry of the node temporarily to
2007            make compression both faster and easier by keeping track of how many non
2008            zero fields are in the node.
2009
2010            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2011            transition.
2012
2013            There are two terms at use here: state as a TRIE_NODEIDX() which is a
2014            number representing the first entry of the node, and state as a
2015            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2016            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2017            are 2 entrys per node. eg:
2018
2019              A B       A B
2020           1. 2 4    1. 3 7
2021           2. 0 3    3. 0 5
2022           3. 0 0    5. 0 0
2023           4. 0 0    7. 0 0
2024
2025            The table is internally in the right hand, idx form. However as we also
2026            have to deal with the states array which is indexed by nodenum we have to
2027            use TRIE_NODENUM() to convert.
2028
2029         */
2030         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2031             "%*sCompiling trie using table compiler\n",
2032             (int)depth * 2 + 2, ""));
2033
2034         trie->trans = (reg_trie_trans *)
2035             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2036                                   * trie->uniquecharcount + 1,
2037                                   sizeof(reg_trie_trans) );
2038         trie->states = (reg_trie_state *)
2039             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2040                                   sizeof(reg_trie_state) );
2041         next_alloc = trie->uniquecharcount + 1;
2042
2043
2044         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2045
2046             regnode *noper   = NEXTOPER( cur );
2047             const U8 *uc     = (U8*)STRING( noper );
2048             const U8 *e      = uc + STR_LEN( noper );
2049
2050             U32 state        = 1;         /* required init */
2051
2052             U16 charid       = 0;         /* sanity init */
2053             U32 accept_state = 0;         /* sanity init */
2054             U8 *scan         = (U8*)NULL; /* sanity init */
2055
2056             STRLEN foldlen   = 0;         /* required init */
2057             U32 wordlen      = 0;         /* required init */
2058             STRLEN skiplen   = 0;
2059             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2060
2061             if (OP(noper) == NOTHING) {
2062                 regnode *noper_next= regnext(noper);
2063                 if (noper_next != tail && OP(noper_next) == flags) {
2064                     noper = noper_next;
2065                     uc= (U8*)STRING(noper);
2066                     e= uc + STR_LEN(noper);
2067                 }
2068             }
2069
2070             if ( OP(noper) != NOTHING ) {
2071                 for ( ; uc < e ; uc += len ) {
2072
2073                     TRIE_READ_CHAR;
2074
2075                     if ( uvc < 256 ) {
2076                         charid = trie->charmap[ uvc ];
2077                     } else {
2078                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2079                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2080                     }
2081                     if ( charid ) {
2082                         charid--;
2083                         if ( !trie->trans[ state + charid ].next ) {
2084                             trie->trans[ state + charid ].next = next_alloc;
2085                             trie->trans[ state ].check++;
2086                             prev_states[TRIE_NODENUM(next_alloc)]
2087                                     = TRIE_NODENUM(state);
2088                             next_alloc += trie->uniquecharcount;
2089                         }
2090                         state = trie->trans[ state + charid ].next;
2091                     } else {
2092                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2093                     }
2094                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2095                 }
2096             }
2097             accept_state = TRIE_NODENUM( state );
2098             TRIE_HANDLE_WORD(accept_state);
2099
2100         } /* end second pass */
2101
2102         /* and now dump it out before we compress it */
2103         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2104                                                           revcharmap,
2105                                                           next_alloc, depth+1));
2106
2107         {
2108         /*
2109            * Inplace compress the table.*
2110
2111            For sparse data sets the table constructed by the trie algorithm will
2112            be mostly 0/FAIL transitions or to put it another way mostly empty.
2113            (Note that leaf nodes will not contain any transitions.)
2114
2115            This algorithm compresses the tables by eliminating most such
2116            transitions, at the cost of a modest bit of extra work during lookup:
2117
2118            - Each states[] entry contains a .base field which indicates the
2119            index in the state[] array wheres its transition data is stored.
2120
2121            - If .base is 0 there are no valid transitions from that node.
2122
2123            - If .base is nonzero then charid is added to it to find an entry in
2124            the trans array.
2125
2126            -If trans[states[state].base+charid].check!=state then the
2127            transition is taken to be a 0/Fail transition. Thus if there are fail
2128            transitions at the front of the node then the .base offset will point
2129            somewhere inside the previous nodes data (or maybe even into a node
2130            even earlier), but the .check field determines if the transition is
2131            valid.
2132
2133            XXX - wrong maybe?
2134            The following process inplace converts the table to the compressed
2135            table: We first do not compress the root node 1,and mark all its
2136            .check pointers as 1 and set its .base pointer as 1 as well. This
2137            allows us to do a DFA construction from the compressed table later,
2138            and ensures that any .base pointers we calculate later are greater
2139            than 0.
2140
2141            - We set 'pos' to indicate the first entry of the second node.
2142
2143            - We then iterate over the columns of the node, finding the first and
2144            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2145            and set the .check pointers accordingly, and advance pos
2146            appropriately and repreat for the next node. Note that when we copy
2147            the next pointers we have to convert them from the original
2148            NODEIDX form to NODENUM form as the former is not valid post
2149            compression.
2150
2151            - If a node has no transitions used we mark its base as 0 and do not
2152            advance the pos pointer.
2153
2154            - If a node only has one transition we use a second pointer into the
2155            structure to fill in allocated fail transitions from other states.
2156            This pointer is independent of the main pointer and scans forward
2157            looking for null transitions that are allocated to a state. When it
2158            finds one it writes the single transition into the "hole".  If the
2159            pointer doesnt find one the single transition is appended as normal.
2160
2161            - Once compressed we can Renew/realloc the structures to release the
2162            excess space.
2163
2164            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2165            specifically Fig 3.47 and the associated pseudocode.
2166
2167            demq
2168         */
2169         const U32 laststate = TRIE_NODENUM( next_alloc );
2170         U32 state, charid;
2171         U32 pos = 0, zp=0;
2172         trie->statecount = laststate;
2173
2174         for ( state = 1 ; state < laststate ; state++ ) {
2175             U8 flag = 0;
2176             const U32 stateidx = TRIE_NODEIDX( state );
2177             const U32 o_used = trie->trans[ stateidx ].check;
2178             U32 used = trie->trans[ stateidx ].check;
2179             trie->trans[ stateidx ].check = 0;
2180
2181             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2182                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2183                     if ( trie->trans[ stateidx + charid ].next ) {
2184                         if (o_used == 1) {
2185                             for ( ; zp < pos ; zp++ ) {
2186                                 if ( ! trie->trans[ zp ].next ) {
2187                                     break;
2188                                 }
2189                             }
2190                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2191                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2192                             trie->trans[ zp ].check = state;
2193                             if ( ++zp > pos ) pos = zp;
2194                             break;
2195                         }
2196                         used--;
2197                     }
2198                     if ( !flag ) {
2199                         flag = 1;
2200                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2201                     }
2202                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2203                     trie->trans[ pos ].check = state;
2204                     pos++;
2205                 }
2206             }
2207         }
2208         trie->lasttrans = pos + 1;
2209         trie->states = (reg_trie_state *)
2210             PerlMemShared_realloc( trie->states, laststate
2211                                    * sizeof(reg_trie_state) );
2212         DEBUG_TRIE_COMPILE_MORE_r(
2213                 PerlIO_printf( Perl_debug_log,
2214                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2215                     (int)depth * 2 + 2,"",
2216                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2217                     (IV)next_alloc,
2218                     (IV)pos,
2219                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2220             );
2221
2222         } /* end table compress */
2223     }
2224     DEBUG_TRIE_COMPILE_MORE_r(
2225             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2226                 (int)depth * 2 + 2, "",
2227                 (UV)trie->statecount,
2228                 (UV)trie->lasttrans)
2229     );
2230     /* resize the trans array to remove unused space */
2231     trie->trans = (reg_trie_trans *)
2232         PerlMemShared_realloc( trie->trans, trie->lasttrans
2233                                * sizeof(reg_trie_trans) );
2234
2235     {   /* Modify the program and insert the new TRIE node */
2236         U8 nodetype =(U8)(flags & 0xFF);
2237         char *str=NULL;
2238
2239 #ifdef DEBUGGING
2240         regnode *optimize = NULL;
2241 #ifdef RE_TRACK_PATTERN_OFFSETS
2242
2243         U32 mjd_offset = 0;
2244         U32 mjd_nodelen = 0;
2245 #endif /* RE_TRACK_PATTERN_OFFSETS */
2246 #endif /* DEBUGGING */
2247         /*
2248            This means we convert either the first branch or the first Exact,
2249            depending on whether the thing following (in 'last') is a branch
2250            or not and whther first is the startbranch (ie is it a sub part of
2251            the alternation or is it the whole thing.)
2252            Assuming its a sub part we convert the EXACT otherwise we convert
2253            the whole branch sequence, including the first.
2254          */
2255         /* Find the node we are going to overwrite */
2256         if ( first != startbranch || OP( last ) == BRANCH ) {
2257             /* branch sub-chain */
2258             NEXT_OFF( first ) = (U16)(last - first);
2259 #ifdef RE_TRACK_PATTERN_OFFSETS
2260             DEBUG_r({
2261                 mjd_offset= Node_Offset((convert));
2262                 mjd_nodelen= Node_Length((convert));
2263             });
2264 #endif
2265             /* whole branch chain */
2266         }
2267 #ifdef RE_TRACK_PATTERN_OFFSETS
2268         else {
2269             DEBUG_r({
2270                 const  regnode *nop = NEXTOPER( convert );
2271                 mjd_offset= Node_Offset((nop));
2272                 mjd_nodelen= Node_Length((nop));
2273             });
2274         }
2275         DEBUG_OPTIMISE_r(
2276             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2277                 (int)depth * 2 + 2, "",
2278                 (UV)mjd_offset, (UV)mjd_nodelen)
2279         );
2280 #endif
2281         /* But first we check to see if there is a common prefix we can
2282            split out as an EXACT and put in front of the TRIE node.  */
2283         trie->startstate= 1;
2284         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2285             U32 state;
2286             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2287                 U32 ofs = 0;
2288                 I32 idx = -1;
2289                 U32 count = 0;
2290                 const U32 base = trie->states[ state ].trans.base;
2291
2292                 if ( trie->states[state].wordnum )
2293                         count = 1;
2294
2295                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2296                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2297                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2298                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2299                     {
2300                         if ( ++count > 1 ) {
2301                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2302                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2303                             if ( state == 1 ) break;
2304                             if ( count == 2 ) {
2305                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2306                                 DEBUG_OPTIMISE_r(
2307                                     PerlIO_printf(Perl_debug_log,
2308                                         "%*sNew Start State=%"UVuf" Class: [",
2309                                         (int)depth * 2 + 2, "",
2310                                         (UV)state));
2311                                 if (idx >= 0) {
2312                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2313                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2314
2315                                     TRIE_BITMAP_SET(trie,*ch);
2316                                     if ( folder )
2317                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2318                                     DEBUG_OPTIMISE_r(
2319                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2320                                     );
2321                                 }
2322                             }
2323                             TRIE_BITMAP_SET(trie,*ch);
2324                             if ( folder )
2325                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2326                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2327                         }
2328                         idx = ofs;
2329                     }
2330                 }
2331                 if ( count == 1 ) {
2332                     SV **tmp = av_fetch( revcharmap, idx, 0);
2333                     STRLEN len;
2334                     char *ch = SvPV( *tmp, len );
2335                     DEBUG_OPTIMISE_r({
2336                         SV *sv=sv_newmortal();
2337                         PerlIO_printf( Perl_debug_log,
2338                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2339                             (int)depth * 2 + 2, "",
2340                             (UV)state, (UV)idx,
2341                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2342                                 PL_colors[0], PL_colors[1],
2343                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2344                                 PERL_PV_ESCAPE_FIRSTCHAR
2345                             )
2346                         );
2347                     });
2348                     if ( state==1 ) {
2349                         OP( convert ) = nodetype;
2350                         str=STRING(convert);
2351                         STR_LEN(convert)=0;
2352                     }
2353                     STR_LEN(convert) += len;
2354                     while (len--)
2355                         *str++ = *ch++;
2356                 } else {
2357 #ifdef DEBUGGING
2358                     if (state>1)
2359                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2360 #endif
2361                     break;
2362                 }
2363             }
2364             trie->prefixlen = (state-1);
2365             if (str) {
2366                 regnode *n = convert+NODE_SZ_STR(convert);
2367                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2368                 trie->startstate = state;
2369                 trie->minlen -= (state - 1);
2370                 trie->maxlen -= (state - 1);
2371 #ifdef DEBUGGING
2372                /* At least the UNICOS C compiler choked on this
2373                 * being argument to DEBUG_r(), so let's just have
2374                 * it right here. */
2375                if (
2376 #ifdef PERL_EXT_RE_BUILD
2377                    1
2378 #else
2379                    DEBUG_r_TEST
2380 #endif
2381                    ) {
2382                    regnode *fix = convert;
2383                    U32 word = trie->wordcount;
2384                    mjd_nodelen++;
2385                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2386                    while( ++fix < n ) {
2387                        Set_Node_Offset_Length(fix, 0, 0);
2388                    }
2389                    while (word--) {
2390                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2391                        if (tmp) {
2392                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2393                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2394                            else
2395                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2396                        }
2397                    }
2398                }
2399 #endif
2400                 if (trie->maxlen) {
2401                     convert = n;
2402                 } else {
2403                     NEXT_OFF(convert) = (U16)(tail - convert);
2404                     DEBUG_r(optimize= n);
2405                 }
2406             }
2407         }
2408         if (!jumper)
2409             jumper = last;
2410         if ( trie->maxlen ) {
2411             NEXT_OFF( convert ) = (U16)(tail - convert);
2412             ARG_SET( convert, data_slot );
2413             /* Store the offset to the first unabsorbed branch in
2414                jump[0], which is otherwise unused by the jump logic.
2415                We use this when dumping a trie and during optimisation. */
2416             if (trie->jump)
2417                 trie->jump[0] = (U16)(nextbranch - convert);
2418
2419             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2420              *   and there is a bitmap
2421              *   and the first "jump target" node we found leaves enough room
2422              * then convert the TRIE node into a TRIEC node, with the bitmap
2423              * embedded inline in the opcode - this is hypothetically faster.
2424              */
2425             if ( !trie->states[trie->startstate].wordnum
2426                  && trie->bitmap
2427                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2428             {
2429                 OP( convert ) = TRIEC;
2430                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2431                 PerlMemShared_free(trie->bitmap);
2432                 trie->bitmap= NULL;
2433             } else
2434                 OP( convert ) = TRIE;
2435
2436             /* store the type in the flags */
2437             convert->flags = nodetype;
2438             DEBUG_r({
2439             optimize = convert
2440                       + NODE_STEP_REGNODE
2441                       + regarglen[ OP( convert ) ];
2442             });
2443             /* XXX We really should free up the resource in trie now,
2444                    as we won't use them - (which resources?) dmq */
2445         }
2446         /* needed for dumping*/
2447         DEBUG_r(if (optimize) {
2448             regnode *opt = convert;
2449
2450             while ( ++opt < optimize) {
2451                 Set_Node_Offset_Length(opt,0,0);
2452             }
2453             /*
2454                 Try to clean up some of the debris left after the
2455                 optimisation.
2456              */
2457             while( optimize < jumper ) {
2458                 mjd_nodelen += Node_Length((optimize));
2459                 OP( optimize ) = OPTIMIZED;
2460                 Set_Node_Offset_Length(optimize,0,0);
2461                 optimize++;
2462             }
2463             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2464         });
2465     } /* end node insert */
2466
2467     /*  Finish populating the prev field of the wordinfo array.  Walk back
2468      *  from each accept state until we find another accept state, and if
2469      *  so, point the first word's .prev field at the second word. If the
2470      *  second already has a .prev field set, stop now. This will be the
2471      *  case either if we've already processed that word's accept state,
2472      *  or that state had multiple words, and the overspill words were
2473      *  already linked up earlier.
2474      */
2475     {
2476         U16 word;
2477         U32 state;
2478         U16 prev;
2479
2480         for (word=1; word <= trie->wordcount; word++) {
2481             prev = 0;
2482             if (trie->wordinfo[word].prev)
2483                 continue;
2484             state = trie->wordinfo[word].accept;
2485             while (state) {
2486                 state = prev_states[state];
2487                 if (!state)
2488                     break;
2489                 prev = trie->states[state].wordnum;
2490                 if (prev)
2491                     break;
2492             }
2493             trie->wordinfo[word].prev = prev;
2494         }
2495         Safefree(prev_states);
2496     }
2497
2498
2499     /* and now dump out the compressed format */
2500     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2501
2502     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2503 #ifdef DEBUGGING
2504     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2505     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2506 #else
2507     SvREFCNT_dec_NN(revcharmap);
2508 #endif
2509     return trie->jump
2510            ? MADE_JUMP_TRIE
2511            : trie->startstate>1
2512              ? MADE_EXACT_TRIE
2513              : MADE_TRIE;
2514 }
2515
2516 STATIC void
2517 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2518 {
2519 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2520
2521    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2522    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2523    ISBN 0-201-10088-6
2524
2525    We find the fail state for each state in the trie, this state is the longest proper
2526    suffix of the current state's 'word' that is also a proper prefix of another word in our
2527    trie. State 1 represents the word '' and is thus the default fail state. This allows
2528    the DFA not to have to restart after its tried and failed a word at a given point, it
2529    simply continues as though it had been matching the other word in the first place.
2530    Consider
2531       'abcdgu'=~/abcdefg|cdgu/
2532    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2533    fail, which would bring us to the state representing 'd' in the second word where we would
2534    try 'g' and succeed, proceeding to match 'cdgu'.
2535  */
2536  /* add a fail transition */
2537     const U32 trie_offset = ARG(source);
2538     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2539     U32 *q;
2540     const U32 ucharcount = trie->uniquecharcount;
2541     const U32 numstates = trie->statecount;
2542     const U32 ubound = trie->lasttrans + ucharcount;
2543     U32 q_read = 0;
2544     U32 q_write = 0;
2545     U32 charid;
2546     U32 base = trie->states[ 1 ].trans.base;
2547     U32 *fail;
2548     reg_ac_data *aho;
2549     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2550     GET_RE_DEBUG_FLAGS_DECL;
2551
2552     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2553 #ifndef DEBUGGING
2554     PERL_UNUSED_ARG(depth);
2555 #endif
2556
2557
2558     ARG_SET( stclass, data_slot );
2559     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2560     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2561     aho->trie=trie_offset;
2562     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2563     Copy( trie->states, aho->states, numstates, reg_trie_state );
2564     Newxz( q, numstates, U32);
2565     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2566     aho->refcount = 1;
2567     fail = aho->fail;
2568     /* initialize fail[0..1] to be 1 so that we always have
2569        a valid final fail state */
2570     fail[ 0 ] = fail[ 1 ] = 1;
2571
2572     for ( charid = 0; charid < ucharcount ; charid++ ) {
2573         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2574         if ( newstate ) {
2575             q[ q_write ] = newstate;
2576             /* set to point at the root */
2577             fail[ q[ q_write++ ] ]=1;
2578         }
2579     }
2580     while ( q_read < q_write) {
2581         const U32 cur = q[ q_read++ % numstates ];
2582         base = trie->states[ cur ].trans.base;
2583
2584         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2585             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2586             if (ch_state) {
2587                 U32 fail_state = cur;
2588                 U32 fail_base;
2589                 do {
2590                     fail_state = fail[ fail_state ];
2591                     fail_base = aho->states[ fail_state ].trans.base;
2592                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2593
2594                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2595                 fail[ ch_state ] = fail_state;
2596                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2597                 {
2598                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2599                 }
2600                 q[ q_write++ % numstates] = ch_state;
2601             }
2602         }
2603     }
2604     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2605        when we fail in state 1, this allows us to use the
2606        charclass scan to find a valid start char. This is based on the principle
2607        that theres a good chance the string being searched contains lots of stuff
2608        that cant be a start char.
2609      */
2610     fail[ 0 ] = fail[ 1 ] = 0;
2611     DEBUG_TRIE_COMPILE_r({
2612         PerlIO_printf(Perl_debug_log,
2613                       "%*sStclass Failtable (%"UVuf" states): 0",
2614                       (int)(depth * 2), "", (UV)numstates
2615         );
2616         for( q_read=1; q_read<numstates; q_read++ ) {
2617             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2618         }
2619         PerlIO_printf(Perl_debug_log, "\n");
2620     });
2621     Safefree(q);
2622     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2623 }
2624
2625
2626 /*
2627  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2628  * These need to be revisited when a newer toolchain becomes available.
2629  */
2630 #if defined(__sparc64__) && defined(__GNUC__)
2631 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2632 #       undef  SPARC64_GCC_WORKAROUND
2633 #       define SPARC64_GCC_WORKAROUND 1
2634 #   endif
2635 #endif
2636
2637 #define DEBUG_PEEP(str,scan,depth) \
2638     DEBUG_OPTIMISE_r({if (scan){ \
2639        SV * const mysv=sv_newmortal(); \
2640        regnode *Next = regnext(scan); \
2641        regprop(RExC_rx, mysv, scan); \
2642        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2643        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2644        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2645    }});
2646
2647
2648 /* The below joins as many adjacent EXACTish nodes as possible into a single
2649  * one.  The regop may be changed if the node(s) contain certain sequences that
2650  * require special handling.  The joining is only done if:
2651  * 1) there is room in the current conglomerated node to entirely contain the
2652  *    next one.
2653  * 2) they are the exact same node type
2654  *
2655  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2656  * these get optimized out
2657  *
2658  * If a node is to match under /i (folded), the number of characters it matches
2659  * can be different than its character length if it contains a multi-character
2660  * fold.  *min_subtract is set to the total delta of the input nodes.
2661  *
2662  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2663  * and contains LATIN SMALL LETTER SHARP S
2664  *
2665  * This is as good a place as any to discuss the design of handling these
2666  * multi-character fold sequences.  It's been wrong in Perl for a very long
2667  * time.  There are three code points in Unicode whose multi-character folds
2668  * were long ago discovered to mess things up.  The previous designs for
2669  * dealing with these involved assigning a special node for them.  This
2670  * approach doesn't work, as evidenced by this example:
2671  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2672  * Both these fold to "sss", but if the pattern is parsed to create a node that
2673  * would match just the \xDF, it won't be able to handle the case where a
2674  * successful match would have to cross the node's boundary.  The new approach
2675  * that hopefully generally solves the problem generates an EXACTFU_SS node
2676  * that is "sss".
2677  *
2678  * It turns out that there are problems with all multi-character folds, and not
2679  * just these three.  Now the code is general, for all such cases, but the
2680  * three still have some special handling.  The approach taken is:
2681  * 1)   This routine examines each EXACTFish node that could contain multi-
2682  *      character fold sequences.  It returns in *min_subtract how much to
2683  *      subtract from the the actual length of the string to get a real minimum
2684  *      match length; it is 0 if there are no multi-char folds.  This delta is
2685  *      used by the caller to adjust the min length of the match, and the delta
2686  *      between min and max, so that the optimizer doesn't reject these
2687  *      possibilities based on size constraints.
2688  * 2)   Certain of these sequences require special handling by the trie code,
2689  *      so, if found, this code changes the joined node type to special ops:
2690  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2691  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2692  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2693  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2694  *      there is a possible fold length change.  That means that a regular
2695  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2696  *      with length changes, and so can be processed faster.  regexec.c takes
2697  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2698  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2699  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2700  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2701  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2702  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2703  *      possibilities for the non-UTF8 patterns are quite simple, except for
2704  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2705  *      members of a fold-pair, and arrays are set up for all of them so that
2706  *      the other member of the pair can be found quickly.  Code elsewhere in
2707  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2708  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2709  *      described in the next item.
2710  * 4)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2711  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2712  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
2713  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
2714  *      character in the pattern corresponds to at most a single character in
2715  *      the target string.  (And I do mean character, and not byte here, unlike
2716  *      other parts of the documentation that have never been updated to
2717  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
2718  *      two character string 'ss'; in EXACTFA nodes it can match
2719  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
2720  *      instances where it is violated.  I'm reluctant to try to change the
2721  *      assumption, as the code involved is impenetrable to me (khw), so
2722  *      instead the code here punts.  This routine examines (when the pattern
2723  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2724  *      boolean indicating whether or not the node contains a sharp s.  When it
2725  *      is true, the caller sets a flag that later causes the optimizer in this
2726  *      file to not set values for the floating and fixed string lengths, and
2727  *      thus avoids the optimizer code in regexec.c that makes the invalid
2728  *      assumption.  Thus, there is no optimization based on string lengths for
2729  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2730  *      (The reason the assumption is wrong only in these two cases is that all
2731  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2732  *      other folds to their expanded versions.  We can't prefold sharp s to
2733  *      'ss' in EXACTF nodes because we don't know at compile time if it
2734  *      actually matches 'ss' or not.  It will match iff the target string is
2735  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2736  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
2737  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2738  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2739  *      require the pattern to be forced into UTF-8, the overhead of which we
2740  *      want to avoid.)
2741  */
2742
2743 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2744     if (PL_regkind[OP(scan)] == EXACT) \
2745         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2746
2747 STATIC U32
2748 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2749     /* Merge several consecutive EXACTish nodes into one. */
2750     regnode *n = regnext(scan);
2751     U32 stringok = 1;
2752     regnode *next = scan + NODE_SZ_STR(scan);
2753     U32 merged = 0;
2754     U32 stopnow = 0;
2755 #ifdef DEBUGGING
2756     regnode *stop = scan;
2757     GET_RE_DEBUG_FLAGS_DECL;
2758 #else
2759     PERL_UNUSED_ARG(depth);
2760 #endif
2761
2762     PERL_ARGS_ASSERT_JOIN_EXACT;
2763 #ifndef EXPERIMENTAL_INPLACESCAN
2764     PERL_UNUSED_ARG(flags);
2765     PERL_UNUSED_ARG(val);
2766 #endif
2767     DEBUG_PEEP("join",scan,depth);
2768
2769     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2770      * EXACT ones that are mergeable to the current one. */
2771     while (n
2772            && (PL_regkind[OP(n)] == NOTHING
2773                || (stringok && OP(n) == OP(scan)))
2774            && NEXT_OFF(n)
2775            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2776     {
2777
2778         if (OP(n) == TAIL || n > next)
2779             stringok = 0;
2780         if (PL_regkind[OP(n)] == NOTHING) {
2781             DEBUG_PEEP("skip:",n,depth);
2782             NEXT_OFF(scan) += NEXT_OFF(n);
2783             next = n + NODE_STEP_REGNODE;
2784 #ifdef DEBUGGING
2785             if (stringok)
2786                 stop = n;
2787 #endif
2788             n = regnext(n);
2789         }
2790         else if (stringok) {
2791             const unsigned int oldl = STR_LEN(scan);
2792             regnode * const nnext = regnext(n);
2793
2794             /* XXX I (khw) kind of doubt that this works on platforms where
2795              * U8_MAX is above 255 because of lots of other assumptions */
2796             /* Don't join if the sum can't fit into a single node */
2797             if (oldl + STR_LEN(n) > U8_MAX)
2798                 break;
2799
2800             DEBUG_PEEP("merg",n,depth);
2801             merged++;
2802
2803             NEXT_OFF(scan) += NEXT_OFF(n);
2804             STR_LEN(scan) += STR_LEN(n);
2805             next = n + NODE_SZ_STR(n);
2806             /* Now we can overwrite *n : */
2807             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2808 #ifdef DEBUGGING
2809             stop = next - 1;
2810 #endif
2811             n = nnext;
2812             if (stopnow) break;
2813         }
2814
2815 #ifdef EXPERIMENTAL_INPLACESCAN
2816         if (flags && !NEXT_OFF(n)) {
2817             DEBUG_PEEP("atch", val, depth);
2818             if (reg_off_by_arg[OP(n)]) {
2819                 ARG_SET(n, val - n);
2820             }
2821             else {
2822                 NEXT_OFF(n) = val - n;
2823             }
2824             stopnow = 1;
2825         }
2826 #endif
2827     }
2828
2829     *min_subtract = 0;
2830     *has_exactf_sharp_s = FALSE;
2831
2832     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2833      * can now analyze for sequences of problematic code points.  (Prior to
2834      * this final joining, sequences could have been split over boundaries, and
2835      * hence missed).  The sequences only happen in folding, hence for any
2836      * non-EXACT EXACTish node */
2837     if (OP(scan) != EXACT) {
2838         const U8 * const s0 = (U8*) STRING(scan);
2839         const U8 * s = s0;
2840         const U8 * const s_end = s0 + STR_LEN(scan);
2841
2842         /* One pass is made over the node's string looking for all the
2843          * possibilities.  to avoid some tests in the loop, there are two main
2844          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2845          * non-UTF-8 */
2846         if (UTF) {
2847
2848             /* Examine the string for a multi-character fold sequence.  UTF-8
2849              * patterns have all characters pre-folded by the time this code is
2850              * executed */
2851             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2852                                      length sequence we are looking for is 2 */
2853             {
2854                 int count = 0;
2855                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2856                 if (! len) {    /* Not a multi-char fold: get next char */
2857                     s += UTF8SKIP(s);
2858                     continue;
2859                 }
2860
2861                 /* Nodes with 'ss' require special handling, except for EXACTFL
2862                  * and EXACTFA for which there is no multi-char fold to this */
2863                 if (len == 2 && *s == 's' && *(s+1) == 's'
2864                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2865                 {
2866                     count = 2;
2867                     OP(scan) = EXACTFU_SS;
2868                     s += 2;
2869                 }
2870                 else if (len == 6   /* len is the same in both ASCII and EBCDIC
2871                                        for these */
2872                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2873                                       COMBINING_DIAERESIS_UTF8
2874                                       COMBINING_ACUTE_ACCENT_UTF8,
2875                                    6)
2876                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2877                                          COMBINING_DIAERESIS_UTF8
2878                                          COMBINING_ACUTE_ACCENT_UTF8,
2879                                      6)))
2880                 {
2881                     count = 3;
2882
2883                     /* These two folds require special handling by trie's, so
2884                      * change the node type to indicate this.  If EXACTFA and
2885                      * EXACTFL were ever to be handled by trie's, this would
2886                      * have to be changed.  If this node has already been
2887                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2888                      * (khw) think it doesn't matter in regexec.c for UTF
2889                      * patterns, but no need to change it */
2890                     if (OP(scan) == EXACTFU) {
2891                         OP(scan) = EXACTFU_TRICKYFOLD;
2892                     }
2893                     s += 6;
2894                 }
2895                 else { /* Here is a generic multi-char fold. */
2896                     const U8* multi_end  = s + len;
2897
2898                     /* Count how many characters in it.  In the case of /l and
2899                      * /aa, no folds which contain ASCII code points are
2900                      * allowed, so check for those, and skip if found.  (In
2901                      * EXACTFL, no folds are allowed to any Latin1 code point,
2902                      * not just ASCII.  But there aren't any of these
2903                      * currently, nor ever likely, so don't take the time to
2904                      * test for them.  The code that generates the
2905                      * is_MULTI_foo() macros croaks should one actually get put
2906                      * into Unicode .) */
2907                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2908                         count = utf8_length(s, multi_end);
2909                         s = multi_end;
2910                     }
2911                     else {
2912                         while (s < multi_end) {
2913                             if (isASCII(*s)) {
2914                                 s++;
2915                                 goto next_iteration;
2916                             }
2917                             else {
2918                                 s += UTF8SKIP(s);
2919                             }
2920                             count++;
2921                         }
2922                     }
2923                 }
2924
2925                 /* The delta is how long the sequence is minus 1 (1 is how long
2926                  * the character that folds to the sequence is) */
2927                 *min_subtract += count - 1;
2928             next_iteration: ;
2929             }
2930         }
2931         else if (OP(scan) == EXACTFA) {
2932
2933             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
2934              * fold to the ASCII range (and there are no existing ones in the
2935              * upper latin1 range).  But, as outlined in the comments preceding
2936              * this function, we need to flag any occurrences of the sharp s */
2937             while (s < s_end) {
2938                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2939                     *has_exactf_sharp_s = TRUE;
2940                     break;
2941                 }
2942                 s++;
2943                 continue;
2944             }
2945         }
2946         else if (OP(scan) != EXACTFL) {
2947
2948             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
2949              * multi-char folds that are all Latin1.  (This code knows that
2950              * there are no current multi-char folds possible with EXACTFL,
2951              * relying on fold_grind.t to catch any errors if the very unlikely
2952              * event happens that some get added in future Unicode versions.)
2953              * As explained in the comments preceding this function, we look
2954              * also for the sharp s in EXACTF nodes; it can be in the final
2955              * position.  Otherwise we can stop looking 1 byte earlier because
2956              * have to find at least two characters for a multi-fold */
2957             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2958
2959             while (s < upper) {
2960                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2961                 if (! len) {    /* Not a multi-char fold. */
2962                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2963                     {
2964                         *has_exactf_sharp_s = TRUE;
2965                     }
2966                     s++;
2967                     continue;
2968                 }
2969
2970                 if (len == 2
2971                     && isARG2_lower_or_UPPER_ARG1('s', *s)
2972                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2973                 {
2974
2975                     /* EXACTF nodes need to know that the minimum length
2976                      * changed so that a sharp s in the string can match this
2977                      * ss in the pattern, but they remain EXACTF nodes, as they
2978                      * won't match this unless the target string is is UTF-8,
2979                      * which we don't know until runtime */
2980                     if (OP(scan) != EXACTF) {
2981                         OP(scan) = EXACTFU_SS;
2982                     }
2983                 }
2984
2985                 *min_subtract += len - 1;
2986                 s += len;
2987             }
2988         }
2989     }
2990
2991 #ifdef DEBUGGING
2992     /* Allow dumping but overwriting the collection of skipped
2993      * ops and/or strings with fake optimized ops */
2994     n = scan + NODE_SZ_STR(scan);
2995     while (n <= stop) {
2996         OP(n) = OPTIMIZED;
2997         FLAGS(n) = 0;
2998         NEXT_OFF(n) = 0;
2999         n++;
3000     }
3001 #endif
3002     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3003     return stopnow;
3004 }
3005
3006 /* REx optimizer.  Converts nodes into quicker variants "in place".
3007    Finds fixed substrings.  */
3008
3009 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3010    to the position after last scanned or to NULL. */
3011
3012 #define INIT_AND_WITHP \
3013     assert(!and_withp); \
3014     Newx(and_withp,1,struct regnode_charclass_class); \
3015     SAVEFREEPV(and_withp)
3016
3017 /* this is a chain of data about sub patterns we are processing that
3018    need to be handled separately/specially in study_chunk. Its so
3019    we can simulate recursion without losing state.  */
3020 struct scan_frame;
3021 typedef struct scan_frame {
3022     regnode *last;  /* last node to process in this frame */
3023     regnode *next;  /* next node to process when last is reached */
3024     struct scan_frame *prev; /*previous frame*/
3025     I32 stop; /* what stopparen do we use */
3026 } scan_frame;
3027
3028
3029 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3030
3031 STATIC I32
3032 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3033                         I32 *minlenp, I32 *deltap,
3034                         regnode *last,
3035                         scan_data_t *data,
3036                         I32 stopparen,
3037                         U8* recursed,
3038                         struct regnode_charclass_class *and_withp,
3039                         U32 flags, U32 depth)
3040                         /* scanp: Start here (read-write). */
3041                         /* deltap: Write maxlen-minlen here. */
3042                         /* last: Stop before this one. */
3043                         /* data: string data about the pattern */
3044                         /* stopparen: treat close N as END */
3045                         /* recursed: which subroutines have we recursed into */
3046                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3047 {
3048     dVAR;
3049     I32 min = 0;    /* There must be at least this number of characters to match */
3050     I32 pars = 0, code;
3051     regnode *scan = *scanp, *next;
3052     I32 delta = 0;
3053     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3054     int is_inf_internal = 0;            /* The studied chunk is infinite */
3055     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3056     scan_data_t data_fake;
3057     SV *re_trie_maxbuff = NULL;
3058     regnode *first_non_open = scan;
3059     I32 stopmin = I32_MAX;
3060     scan_frame *frame = NULL;
3061     GET_RE_DEBUG_FLAGS_DECL;
3062
3063     PERL_ARGS_ASSERT_STUDY_CHUNK;
3064
3065 #ifdef DEBUGGING
3066     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3067 #endif
3068
3069     if ( depth == 0 ) {
3070         while (first_non_open && OP(first_non_open) == OPEN)
3071             first_non_open=regnext(first_non_open);
3072     }
3073
3074
3075   fake_study_recurse:
3076     while ( scan && OP(scan) != END && scan < last ){
3077         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3078                                    node length to get a real minimum (because
3079                                    the folded version may be shorter) */
3080         bool has_exactf_sharp_s = FALSE;
3081         /* Peephole optimizer: */
3082         DEBUG_STUDYDATA("Peep:", data,depth);
3083         DEBUG_PEEP("Peep",scan,depth);
3084
3085         /* Its not clear to khw or hv why this is done here, and not in the
3086          * clauses that deal with EXACT nodes.  khw's guess is that it's
3087          * because of a previous design */
3088         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3089
3090         /* Follow the next-chain of the current node and optimize
3091            away all the NOTHINGs from it.  */
3092         if (OP(scan) != CURLYX) {
3093             const int max = (reg_off_by_arg[OP(scan)]
3094                        ? I32_MAX
3095                        /* I32 may be smaller than U16 on CRAYs! */
3096                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3097             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3098             int noff;
3099             regnode *n = scan;
3100
3101             /* Skip NOTHING and LONGJMP. */
3102             while ((n = regnext(n))
3103                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3104                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3105                    && off + noff < max)
3106                 off += noff;
3107             if (reg_off_by_arg[OP(scan)])
3108                 ARG(scan) = off;
3109             else
3110                 NEXT_OFF(scan) = off;
3111         }
3112
3113
3114
3115         /* The principal pseudo-switch.  Cannot be a switch, since we
3116            look into several different things.  */
3117         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3118                    || OP(scan) == IFTHEN) {
3119             next = regnext(scan);
3120             code = OP(scan);
3121             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3122
3123             if (OP(next) == code || code == IFTHEN) {
3124                 /* NOTE - There is similar code to this block below for handling
3125                    TRIE nodes on a re-study.  If you change stuff here check there
3126                    too. */
3127                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3128                 struct regnode_charclass_class accum;
3129                 regnode * const startbranch=scan;
3130
3131                 if (flags & SCF_DO_SUBSTR)
3132                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3133                 if (flags & SCF_DO_STCLASS)
3134                     cl_init_zero(pRExC_state, &accum);
3135
3136                 while (OP(scan) == code) {
3137                     I32 deltanext, minnext, f = 0, fake;
3138                     struct regnode_charclass_class this_class;
3139
3140                     num++;
3141                     data_fake.flags = 0;
3142                     if (data) {
3143                         data_fake.whilem_c = data->whilem_c;
3144                         data_fake.last_closep = data->last_closep;
3145                     }
3146                     else
3147                         data_fake.last_closep = &fake;
3148
3149                     data_fake.pos_delta = delta;
3150                     next = regnext(scan);
3151                     scan = NEXTOPER(scan);
3152                     if (code != BRANCH)
3153                         scan = NEXTOPER(scan);
3154                     if (flags & SCF_DO_STCLASS) {
3155                         cl_init(pRExC_state, &this_class);
3156                         data_fake.start_class = &this_class;
3157                         f = SCF_DO_STCLASS_AND;
3158                     }
3159                     if (flags & SCF_WHILEM_VISITED_POS)
3160                         f |= SCF_WHILEM_VISITED_POS;
3161
3162                     /* we suppose the run is continuous, last=next...*/
3163                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3164                                           next, &data_fake,
3165                                           stopparen, recursed, NULL, f,depth+1);
3166                     if (min1 > minnext)
3167                         min1 = minnext;
3168                     if (deltanext == I32_MAX) {
3169                         is_inf = is_inf_internal = 1;
3170                         max1 = I32_MAX;
3171                     } else if (max1 < minnext + deltanext)
3172                         max1 = minnext + deltanext;
3173                     scan = next;
3174                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3175                         pars++;
3176                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3177                         if ( stopmin > minnext)
3178                             stopmin = min + min1;
3179                         flags &= ~SCF_DO_SUBSTR;
3180                         if (data)
3181                             data->flags |= SCF_SEEN_ACCEPT;
3182                     }
3183                     if (data) {
3184                         if (data_fake.flags & SF_HAS_EVAL)
3185                             data->flags |= SF_HAS_EVAL;
3186                         data->whilem_c = data_fake.whilem_c;
3187                     }
3188                     if (flags & SCF_DO_STCLASS)
3189                         cl_or(pRExC_state, &accum, &this_class);
3190                 }
3191                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3192                     min1 = 0;
3193                 if (flags & SCF_DO_SUBSTR) {
3194                     data->pos_min += min1;
3195                     if (data->pos_delta >= I32_MAX - (max1 - min1))
3196                         data->pos_delta = I32_MAX;
3197                     else
3198                         data->pos_delta += max1 - min1;
3199                     if (max1 != min1 || is_inf)
3200                         data->longest = &(data->longest_float);
3201                 }
3202                 min += min1;
3203                 if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3204                     delta = I32_MAX;
3205                 else
3206                     delta += max1 - min1;
3207                 if (flags & SCF_DO_STCLASS_OR) {
3208                     cl_or(pRExC_state, data->start_class, &accum);
3209                     if (min1) {
3210                         cl_and(data->start_class, and_withp);
3211                         flags &= ~SCF_DO_STCLASS;
3212                     }
3213                 }
3214                 else if (flags & SCF_DO_STCLASS_AND) {
3215                     if (min1) {
3216                         cl_and(data->start_class, &accum);
3217                         flags &= ~SCF_DO_STCLASS;
3218                     }
3219                     else {
3220                         /* Switch to OR mode: cache the old value of
3221                          * data->start_class */
3222                         INIT_AND_WITHP;
3223                         StructCopy(data->start_class, and_withp,
3224                                    struct regnode_charclass_class);
3225                         flags &= ~SCF_DO_STCLASS_AND;
3226                         StructCopy(&accum, data->start_class,
3227                                    struct regnode_charclass_class);
3228                         flags |= SCF_DO_STCLASS_OR;
3229                         SET_SSC_EOS(data->start_class);
3230                     }
3231                 }
3232
3233                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3234                 /* demq.
3235
3236                    Assuming this was/is a branch we are dealing with: 'scan' now
3237                    points at the item that follows the branch sequence, whatever
3238                    it is. We now start at the beginning of the sequence and look
3239                    for subsequences of
3240
3241                    BRANCH->EXACT=>x1
3242                    BRANCH->EXACT=>x2
3243                    tail
3244
3245                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3246
3247                    If we can find such a subsequence we need to turn the first
3248                    element into a trie and then add the subsequent branch exact
3249                    strings to the trie.
3250
3251                    We have two cases
3252
3253                      1. patterns where the whole set of branches can be converted.
3254
3255                      2. patterns where only a subset can be converted.
3256
3257                    In case 1 we can replace the whole set with a single regop
3258                    for the trie. In case 2 we need to keep the start and end
3259                    branches so
3260
3261                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3262                      becomes BRANCH TRIE; BRANCH X;
3263
3264                   There is an additional case, that being where there is a
3265                   common prefix, which gets split out into an EXACT like node
3266                   preceding the TRIE node.
3267
3268                   If x(1..n)==tail then we can do a simple trie, if not we make
3269                   a "jump" trie, such that when we match the appropriate word
3270                   we "jump" to the appropriate tail node. Essentially we turn
3271                   a nested if into a case structure of sorts.
3272
3273                 */
3274
3275                     int made=0;
3276                     if (!re_trie_maxbuff) {
3277                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3278                         if (!SvIOK(re_trie_maxbuff))
3279                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3280                     }
3281                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3282                         regnode *cur;
3283                         regnode *first = (regnode *)NULL;
3284                         regnode *last = (regnode *)NULL;
3285                         regnode *tail = scan;
3286                         U8 trietype = 0;
3287                         U32 count=0;
3288
3289 #ifdef DEBUGGING
3290                         SV * const mysv = sv_newmortal();       /* for dumping */
3291 #endif
3292                         /* var tail is used because there may be a TAIL
3293                            regop in the way. Ie, the exacts will point to the
3294                            thing following the TAIL, but the last branch will
3295                            point at the TAIL. So we advance tail. If we
3296                            have nested (?:) we may have to move through several
3297                            tails.
3298                          */
3299
3300                         while ( OP( tail ) == TAIL ) {
3301                             /* this is the TAIL generated by (?:) */
3302                             tail = regnext( tail );
3303                         }
3304
3305
3306                         DEBUG_TRIE_COMPILE_r({
3307                             regprop(RExC_rx, mysv, tail );
3308                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3309                                 (int)depth * 2 + 2, "",
3310                                 "Looking for TRIE'able sequences. Tail node is: ",
3311                                 SvPV_nolen_const( mysv )
3312                             );
3313                         });
3314
3315                         /*
3316
3317                             Step through the branches
3318                                 cur represents each branch,
3319                                 noper is the first thing to be matched as part of that branch
3320                                 noper_next is the regnext() of that node.
3321
3322                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3323                             via a "jump trie" but we also support building with NOJUMPTRIE,
3324                             which restricts the trie logic to structures like /FOO|BAR/.
3325
3326                             If noper is a trieable nodetype then the branch is a possible optimization
3327                             target. If we are building under NOJUMPTRIE then we require that noper_next
3328                             is the same as scan (our current position in the regex program).
3329
3330                             Once we have two or more consecutive such branches we can create a
3331                             trie of the EXACT's contents and stitch it in place into the program.
3332
3333                             If the sequence represents all of the branches in the alternation we
3334                             replace the entire thing with a single TRIE node.
3335
3336                             Otherwise when it is a subsequence we need to stitch it in place and
3337                             replace only the relevant branches. This means the first branch has
3338                             to remain as it is used by the alternation logic, and its next pointer,
3339                             and needs to be repointed at the item on the branch chain following
3340                             the last branch we have optimized away.
3341
3342                             This could be either a BRANCH, in which case the subsequence is internal,
3343                             or it could be the item following the branch sequence in which case the
3344                             subsequence is at the end (which does not necessarily mean the first node
3345                             is the start of the alternation).
3346
3347                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3348
3349                                 optype          |  trietype
3350                                 ----------------+-----------
3351                                 NOTHING         | NOTHING
3352                                 EXACT           | EXACT
3353                                 EXACTFU         | EXACTFU
3354                                 EXACTFU_SS      | EXACTFU
3355                                 EXACTFU_TRICKYFOLD | EXACTFU
3356                                 EXACTFA         | 0
3357
3358
3359                         */
3360 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3361                        ( EXACT == (X) )   ? EXACT :        \
3362                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3363                        0 )
3364
3365                         /* dont use tail as the end marker for this traverse */
3366                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3367                             regnode * const noper = NEXTOPER( cur );
3368                             U8 noper_type = OP( noper );
3369                             U8 noper_trietype = TRIE_TYPE( noper_type );
3370 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3371                             regnode * const noper_next = regnext( noper );
3372                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3373                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3374 #endif
3375
3376                             DEBUG_TRIE_COMPILE_r({
3377                                 regprop(RExC_rx, mysv, cur);
3378                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3379                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3380
3381                                 regprop(RExC_rx, mysv, noper);
3382                                 PerlIO_printf( Perl_debug_log, " -> %s",
3383                                     SvPV_nolen_const(mysv));
3384
3385                                 if ( noper_next ) {
3386                                   regprop(RExC_rx, mysv, noper_next );
3387                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3388                                     SvPV_nolen_const(mysv));
3389                                 }
3390                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3391                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3392                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3393                                 );
3394                             });
3395
3396                             /* Is noper a trieable nodetype that can be merged with the
3397                              * current trie (if there is one)? */
3398                             if ( noper_trietype
3399                                   &&
3400                                   (
3401                                         ( noper_trietype == NOTHING)
3402                                         || ( trietype == NOTHING )
3403                                         || ( trietype == noper_trietype )
3404                                   )
3405 #ifdef NOJUMPTRIE
3406                                   && noper_next == tail
3407 #endif
3408                                   && count < U16_MAX)
3409                             {
3410                                 /* Handle mergable triable node
3411                                  * Either we are the first node in a new trieable sequence,
3412                                  * in which case we do some bookkeeping, otherwise we update
3413                                  * the end pointer. */
3414                                 if ( !first ) {
3415                                     first = cur;
3416                                     if ( noper_trietype == NOTHING ) {
3417 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3418                                         regnode * const noper_next = regnext( noper );
3419                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3420                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3421 #endif
3422
3423                                         if ( noper_next_trietype ) {
3424                                             trietype = noper_next_trietype;
3425                                         } else if (noper_next_type)  {
3426                                             /* a NOTHING regop is 1 regop wide. We need at least two
3427                                              * for a trie so we can't merge this in */
3428                                             first = NULL;
3429                                         }
3430                                     } else {
3431                                         trietype = noper_trietype;
3432                                     }
3433                                 } else {
3434                                     if ( trietype == NOTHING )
3435                                         trietype = noper_trietype;
3436                                     last = cur;
3437                                 }
3438                                 if (first)
3439                                     count++;
3440                             } /* end handle mergable triable node */
3441                             else {
3442                                 /* handle unmergable node -
3443                                  * noper may either be a triable node which can not be tried
3444                                  * together with the current trie, or a non triable node */
3445                                 if ( last ) {
3446                                     /* If last is set and trietype is not NOTHING then we have found
3447                                      * at least two triable branch sequences in a row of a similar
3448                                      * trietype so we can turn them into a trie. If/when we
3449                                      * allow NOTHING to start a trie sequence this condition will be
3450                                      * required, and it isn't expensive so we leave it in for now. */
3451                                     if ( trietype && trietype != NOTHING )
3452                                         make_trie( pRExC_state,
3453                                                 startbranch, first, cur, tail, count,
3454                                                 trietype, depth+1 );
3455                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3456                                 }
3457                                 if ( noper_trietype
3458 #ifdef NOJUMPTRIE
3459                                      && noper_next == tail
3460 #endif
3461                                 ){
3462                                     /* noper is triable, so we can start a new trie sequence */
3463                                     count = 1;
3464                                     first = cur;
3465                                     trietype = noper_trietype;
3466                                 } else if (first) {
3467                                     /* if we already saw a first but the current node is not triable then we have
3468                                      * to reset the first information. */
3469                                     count = 0;
3470                                     first = NULL;
3471                                     trietype = 0;
3472                                 }
3473                             } /* end handle unmergable node */
3474                         } /* loop over branches */
3475                         DEBUG_TRIE_COMPILE_r({
3476                             regprop(RExC_rx, mysv, cur);
3477                             PerlIO_printf( Perl_debug_log,
3478                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3479                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3480
3481                         });
3482                         if ( last && trietype ) {
3483                             if ( trietype != NOTHING ) {
3484                                 /* the last branch of the sequence was part of a trie,
3485                                  * so we have to construct it here outside of the loop
3486                                  */
3487                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3488 #ifdef TRIE_STUDY_OPT
3489                                 if ( ((made == MADE_EXACT_TRIE &&
3490                                      startbranch == first)
3491                                      || ( first_non_open == first )) &&
3492                                      depth==0 ) {
3493                                     flags |= SCF_TRIE_RESTUDY;
3494                                     if ( startbranch == first
3495                                          && scan == tail )
3496                                     {
3497                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3498                                     }
3499                                 }
3500 #endif
3501                             } else {
3502                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3503                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3504                                  */
3505                                 if ( startbranch == first ) {
3506                                     regnode *opt;
3507                                     /* the entire thing is a NOTHING sequence, something like this:
3508                                      * (?:|) So we can turn it into a plain NOTHING op. */
3509                                     DEBUG_TRIE_COMPILE_r({
3510                                         regprop(RExC_rx, mysv, cur);
3511                                         PerlIO_printf( Perl_debug_log,
3512                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3513                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3514
3515                                     });
3516                                     OP(startbranch)= NOTHING;
3517                                     NEXT_OFF(startbranch)= tail - startbranch;
3518                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3519                                         OP(opt)= OPTIMIZED;
3520                                 }
3521                             }
3522                         } /* end if ( last) */
3523                     } /* TRIE_MAXBUF is non zero */
3524
3525                 } /* do trie */
3526
3527             }
3528             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3529                 scan = NEXTOPER(NEXTOPER(scan));
3530             } else                      /* single branch is optimized. */
3531                 scan = NEXTOPER(scan);
3532             continue;
3533         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3534             scan_frame *newframe = NULL;
3535             I32 paren;
3536             regnode *start;
3537             regnode *end;
3538
3539             if (OP(scan) != SUSPEND) {
3540             /* set the pointer */
3541                 if (OP(scan) == GOSUB) {
3542                     paren = ARG(scan);
3543                     RExC_recurse[ARG2L(scan)] = scan;
3544                     start = RExC_open_parens[paren-1];
3545                     end   = RExC_close_parens[paren-1];
3546                 } else {
3547                     paren = 0;
3548                     start = RExC_rxi->program + 1;
3549                     end   = RExC_opend;
3550                 }
3551                 if (!recursed) {
3552                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3553                     SAVEFREEPV(recursed);
3554                 }
3555                 if (!PAREN_TEST(recursed,paren+1)) {
3556                     PAREN_SET(recursed,paren+1);
3557                     Newx(newframe,1,scan_frame);
3558                 } else {
3559                     if (flags & SCF_DO_SUBSTR) {
3560                         SCAN_COMMIT(pRExC_state,data,minlenp);
3561                         data->longest = &(data->longest_float);
3562                     }
3563                     is_inf = is_inf_internal = 1;
3564                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3565                         cl_anything(pRExC_state, data->start_class);
3566                     flags &= ~SCF_DO_STCLASS;
3567                 }
3568             } else {
3569                 Newx(newframe,1,scan_frame);
3570                 paren = stopparen;
3571                 start = scan+2;
3572                 end = regnext(scan);
3573             }
3574             if (newframe) {
3575                 assert(start);
3576                 assert(end);
3577                 SAVEFREEPV(newframe);
3578                 newframe->next = regnext(scan);
3579                 newframe->last = last;
3580                 newframe->stop = stopparen;
3581                 newframe->prev = frame;
3582
3583                 frame = newframe;
3584                 scan =  start;
3585                 stopparen = paren;
3586                 last = end;
3587
3588                 continue;
3589             }
3590         }
3591         else if (OP(scan) == EXACT) {
3592             I32 l = STR_LEN(scan);
3593             UV uc;
3594             if (UTF) {
3595                 const U8 * const s = (U8*)STRING(scan);
3596                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3597                 l = utf8_length(s, s + l);
3598             } else {
3599                 uc = *((U8*)STRING(scan));
3600             }
3601             min += l;
3602             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3603                 /* The code below prefers earlier match for fixed
3604                    offset, later match for variable offset.  */
3605                 if (data->last_end == -1) { /* Update the start info. */
3606                     data->last_start_min = data->pos_min;
3607                     data->last_start_max = is_inf
3608                         ? I32_MAX : data->pos_min + data->pos_delta;
3609                 }
3610                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3611                 if (UTF)
3612                     SvUTF8_on(data->last_found);
3613                 {
3614                     SV * const sv = data->last_found;
3615                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3616                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3617                     if (mg && mg->mg_len >= 0)
3618                         mg->mg_len += utf8_length((U8*)STRING(scan),
3619                                                   (U8*)STRING(scan)+STR_LEN(scan));
3620                 }
3621                 data->last_end = data->pos_min + l;
3622                 data->pos_min += l; /* As in the first entry. */
3623                 data->flags &= ~SF_BEFORE_EOL;
3624             }
3625             if (flags & SCF_DO_STCLASS_AND) {
3626                 /* Check whether it is compatible with what we know already! */
3627                 int compat = 1;
3628
3629
3630                 /* If compatible, we or it in below.  It is compatible if is
3631                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3632                  * it's for a locale.  Even if there isn't unicode semantics
3633                  * here, at runtime there may be because of matching against a
3634                  * utf8 string, so accept a possible false positive for
3635                  * latin1-range folds */
3636                 if (uc >= 0x100 ||
3637                     (!(data->start_class->flags & ANYOF_LOCALE)
3638                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3639                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3640                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3641                     )
3642                 {
3643                     compat = 0;
3644                 }
3645                 ANYOF_CLASS_ZERO(data->start_class);
3646                 ANYOF_BITMAP_ZERO(data->start_class);
3647                 if (compat)
3648                     ANYOF_BITMAP_SET(data->start_class, uc);
3649                 else if (uc >= 0x100) {
3650                     int i;
3651
3652                     /* Some Unicode code points fold to the Latin1 range; as
3653                      * XXX temporary code, instead of figuring out if this is
3654                      * one, just assume it is and set all the start class bits
3655                      * that could be some such above 255 code point's fold
3656                      * which will generate fals positives.  As the code
3657                      * elsewhere that does compute the fold settles down, it
3658                      * can be extracted out and re-used here */
3659                     for (i = 0; i < 256; i++){
3660                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3661                             ANYOF_BITMAP_SET(data->start_class, i);
3662                         }
3663                     }
3664                 }
3665                 CLEAR_SSC_EOS(data->start_class);
3666                 if (uc < 0x100)
3667                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3668             }
3669             else if (flags & SCF_DO_STCLASS_OR) {
3670                 /* false positive possible if the class is case-folded */
3671                 if (uc < 0x100)
3672                     ANYOF_BITMAP_SET(data->start_class, uc);
3673                 else
3674                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3675                 CLEAR_SSC_EOS(data->start_class);
3676                 cl_and(data->start_class, and_withp);
3677             }
3678             flags &= ~SCF_DO_STCLASS;
3679         }
3680         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3681             I32 l = STR_LEN(scan);
3682             UV uc = *((U8*)STRING(scan));
3683
3684             /* Search for fixed substrings supports EXACT only. */
3685             if (flags & SCF_DO_SUBSTR) {
3686                 assert(data);
3687                 SCAN_COMMIT(pRExC_state, data, minlenp);
3688             }
3689             if (UTF) {
3690                 const U8 * const s = (U8 *)STRING(scan);
3691                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3692                 l = utf8_length(s, s + l);
3693             }
3694             if (has_exactf_sharp_s) {
3695                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3696             }
3697             min += l - min_subtract;
3698             assert (min >= 0);
3699             delta += min_subtract;
3700             if (flags & SCF_DO_SUBSTR) {
3701                 data->pos_min += l - min_subtract;
3702                 if (data->pos_min < 0) {
3703                     data->pos_min = 0;
3704                 }
3705                 data->pos_delta += min_subtract;
3706                 if (min_subtract) {
3707                     data->longest = &(data->longest_float);
3708                 }
3709             }
3710             if (flags & SCF_DO_STCLASS_AND) {
3711                 /* Check whether it is compatible with what we know already! */
3712                 int compat = 1;
3713                 if (uc >= 0x100 ||
3714                  (!(data->start_class->flags & ANYOF_LOCALE)
3715                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3716                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3717                 {
3718                     compat = 0;
3719                 }
3720                 ANYOF_CLASS_ZERO(data->start_class);
3721                 ANYOF_BITMAP_ZERO(data->start_class);
3722                 if (compat) {
3723                     ANYOF_BITMAP_SET(data->start_class, uc);
3724                     CLEAR_SSC_EOS(data->start_class);
3725                     if (OP(scan) == EXACTFL) {
3726                         /* XXX This set is probably no longer necessary, and
3727                          * probably wrong as LOCALE now is on in the initial
3728                          * state */
3729                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3730                     }
3731                     else {
3732
3733                         /* Also set the other member of the fold pair.  In case
3734                          * that unicode semantics is called for at runtime, use
3735                          * the full latin1 fold.  (Can't do this for locale,
3736                          * because not known until runtime) */
3737                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3738
3739                         /* All other (EXACTFL handled above) folds except under
3740                          * /iaa that include s, S, and sharp_s also may include
3741                          * the others */
3742                         if (OP(scan) != EXACTFA) {
3743                             if (uc == 's' || uc == 'S') {
3744                                 ANYOF_BITMAP_SET(data->start_class,
3745                                                  LATIN_SMALL_LETTER_SHARP_S);
3746                             }
3747                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3748                                 ANYOF_BITMAP_SET(data->start_class, 's');
3749                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3750                             }
3751                         }
3752                     }
3753                 }
3754                 else if (uc >= 0x100) {
3755                     int i;
3756                     for (i = 0; i < 256; i++){
3757                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3758                             ANYOF_BITMAP_SET(data->start_class, i);
3759                         }
3760                     }
3761                 }
3762             }
3763             else if (flags & SCF_DO_STCLASS_OR) {
3764                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3765                     /* false positive possible if the class is case-folded.
3766                        Assume that the locale settings are the same... */
3767                     if (uc < 0x100) {
3768                         ANYOF_BITMAP_SET(data->start_class, uc);
3769                         if (OP(scan) != EXACTFL) {
3770
3771                             /* And set the other member of the fold pair, but
3772                              * can't do that in locale because not known until
3773                              * run-time */
3774                             ANYOF_BITMAP_SET(data->start_class,
3775                                              PL_fold_latin1[uc]);
3776
3777                             /* All folds except under /iaa that include s, S,
3778                              * and sharp_s also may include the others */
3779                             if (OP(scan) != EXACTFA) {
3780                                 if (uc == 's' || uc == 'S') {
3781                                     ANYOF_BITMAP_SET(data->start_class,
3782                                                    LATIN_SMALL_LETTER_SHARP_S);
3783                                 }
3784                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3785                                     ANYOF_BITMAP_SET(data->start_class, 's');
3786                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3787                                 }
3788                             }
3789                         }
3790                     }
3791                     CLEAR_SSC_EOS(data->start_class);
3792                 }
3793                 cl_and(data->start_class, and_withp);
3794             }
3795             flags &= ~SCF_DO_STCLASS;
3796         }
3797         else if (REGNODE_VARIES(OP(scan))) {
3798             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3799             I32 f = flags, pos_before = 0;
3800             regnode * const oscan = scan;
3801             struct regnode_charclass_class this_class;
3802             struct regnode_charclass_class *oclass = NULL;
3803             I32 next_is_eval = 0;
3804
3805             switch (PL_regkind[OP(scan)]) {
3806             case WHILEM:                /* End of (?:...)* . */
3807                 scan = NEXTOPER(scan);
3808                 goto finish;
3809             case PLUS:
3810                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3811                     next = NEXTOPER(scan);
3812                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3813                         mincount = 1;
3814                         maxcount = REG_INFTY;
3815                         next = regnext(scan);
3816                         scan = NEXTOPER(scan);
3817                         goto do_curly;
3818                     }
3819                 }
3820                 if (flags & SCF_DO_SUBSTR)
3821                     data->pos_min++;
3822                 min++;
3823                 /* Fall through. */
3824             case STAR:
3825                 if (flags & SCF_DO_STCLASS) {
3826                     mincount = 0;
3827                     maxcount = REG_INFTY;
3828                     next = regnext(scan);
3829                     scan = NEXTOPER(scan);
3830                     goto do_curly;
3831                 }
3832                 is_inf = is_inf_internal = 1;
3833                 scan = regnext(scan);
3834                 if (flags & SCF_DO_SUBSTR) {
3835                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3836                     data->longest = &(data->longest_float);
3837                 }
3838                 goto optimize_curly_tail;
3839             case CURLY:
3840                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3841                     && (scan->flags == stopparen))
3842                 {
3843                     mincount = 1;
3844                     maxcount = 1;
3845                 } else {
3846                     mincount = ARG1(scan);
3847                     maxcount = ARG2(scan);
3848                 }
3849                 next = regnext(scan);
3850                 if (OP(scan) == CURLYX) {
3851                     I32 lp = (data ? *(data->last_closep) : 0);
3852                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3853                 }
3854                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3855                 next_is_eval = (OP(scan) == EVAL);
3856               do_curly:
3857                 if (flags & SCF_DO_SUBSTR) {
3858                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3859                     pos_before = data->pos_min;
3860                 }
3861                 if (data) {
3862                     fl = data->flags;
3863                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3864                     if (is_inf)
3865                         data->flags |= SF_IS_INF;
3866                 }
3867                 if (flags & SCF_DO_STCLASS) {
3868                     cl_init(pRExC_state, &this_class);
3869                     oclass = data->start_class;
3870                     data->start_class = &this_class;
3871                     f |= SCF_DO_STCLASS_AND;
3872                     f &= ~SCF_DO_STCLASS_OR;
3873                 }
3874                 /* Exclude from super-linear cache processing any {n,m}
3875                    regops for which the combination of input pos and regex
3876                    pos is not enough information to determine if a match
3877                    will be possible.
3878
3879                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3880                    regex pos at the \s*, the prospects for a match depend not
3881                    only on the input position but also on how many (bar\s*)
3882                    repeats into the {4,8} we are. */
3883                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3884                     f &= ~SCF_WHILEM_VISITED_POS;
3885
3886                 /* This will finish on WHILEM, setting scan, or on NULL: */
3887                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3888                                       last, data, stopparen, recursed, NULL,
3889                                       (mincount == 0
3890                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3891
3892                 if (flags & SCF_DO_STCLASS)
3893                     data->start_class = oclass;
3894                 if (mincount == 0 || minnext == 0) {
3895                     if (flags & SCF_DO_STCLASS_OR) {
3896                         cl_or(pRExC_state, data->start_class, &this_class);
3897                     }
3898                     else if (flags & SCF_DO_STCLASS_AND) {
3899                         /* Switch to OR mode: cache the old value of
3900                          * data->start_class */
3901                         INIT_AND_WITHP;
3902                         StructCopy(data->start_class, and_withp,
3903                                    struct regnode_charclass_class);
3904                         flags &= ~SCF_DO_STCLASS_AND;
3905                         StructCopy(&this_class, data->start_class,
3906                                    struct regnode_charclass_class);
3907                         flags |= SCF_DO_STCLASS_OR;
3908                         SET_SSC_EOS(data->start_class);
3909                     }
3910                 } else {                /* Non-zero len */
3911                     if (flags & SCF_DO_STCLASS_OR) {
3912                         cl_or(pRExC_state, data->start_class, &this_class);
3913                         cl_and(data->start_class, and_withp);
3914                     }
3915                     else if (flags & SCF_DO_STCLASS_AND)
3916                         cl_and(data->start_class, &this_class);
3917                     flags &= ~SCF_DO_STCLASS;
3918                 }
3919                 if (!scan)              /* It was not CURLYX, but CURLY. */
3920                     scan = next;
3921                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
3922                     /* ? quantifier ok, except for (?{ ... }) */
3923                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
3924                     && (minnext == 0) && (deltanext == 0)
3925                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3926                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3927                 {
3928                     /* Fatal warnings may leak the regexp without this: */
3929                     SAVEFREESV(RExC_rx_sv);
3930                     ckWARNreg(RExC_parse,
3931                               "Quantifier unexpected on zero-length expression");
3932                     (void)ReREFCNT_inc(RExC_rx_sv);
3933                 }
3934
3935                 min += minnext * mincount;
3936                 is_inf_internal |= deltanext == I32_MAX
3937                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
3938                 is_inf |= is_inf_internal;
3939                 if (is_inf)
3940                     delta = I32_MAX;
3941                 else
3942                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3943
3944                 /* Try powerful optimization CURLYX => CURLYN. */
3945                 if (  OP(oscan) == CURLYX && data
3946                       && data->flags & SF_IN_PAR
3947                       && !(data->flags & SF_HAS_EVAL)
3948                       && !deltanext && minnext == 1 ) {
3949                     /* Try to optimize to CURLYN.  */
3950                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3951                     regnode * const nxt1 = nxt;
3952 #ifdef DEBUGGING
3953                     regnode *nxt2;
3954 #endif
3955
3956                     /* Skip open. */
3957                     nxt = regnext(nxt);
3958                     if (!REGNODE_SIMPLE(OP(nxt))
3959                         && !(PL_regkind[OP(nxt)] == EXACT
3960                              && STR_LEN(nxt) == 1))
3961                         goto nogo;
3962 #ifdef DEBUGGING
3963                     nxt2 = nxt;
3964 #endif
3965                     nxt = regnext(nxt);
3966                     if (OP(nxt) != CLOSE)
3967                         goto nogo;
3968                     if (RExC_open_parens) {
3969                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3970                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3971                     }
3972                     /* Now we know that nxt2 is the only contents: */
3973                     oscan->flags = (U8)ARG(nxt);
3974                     OP(oscan) = CURLYN;
3975                     OP(nxt1) = NOTHING; /* was OPEN. */
3976
3977 #ifdef DEBUGGING
3978                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3979                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3980                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3981                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3982                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3983                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3984 #endif
3985                 }
3986               nogo:
3987
3988                 /* Try optimization CURLYX => CURLYM. */
3989                 if (  OP(oscan) == CURLYX && data
3990                       && !(data->flags & SF_HAS_PAR)
3991                       && !(data->flags & SF_HAS_EVAL)
3992                       && !deltanext     /* atom is fixed width */
3993                       && minnext != 0   /* CURLYM can't handle zero width */
3994                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3995                 ) {
3996                     /* XXXX How to optimize if data == 0? */
3997                     /* Optimize to a simpler form.  */
3998                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3999                     regnode *nxt2;
4000
4001                     OP(oscan) = CURLYM;
4002                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4003                             && (OP(nxt2) != WHILEM))
4004                         nxt = nxt2;
4005                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4006                     /* Need to optimize away parenths. */
4007                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4008                         /* Set the parenth number.  */
4009                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4010
4011                         oscan->flags = (U8)ARG(nxt);
4012                         if (RExC_open_parens) {
4013                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4014                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4015                         }
4016                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4017                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4018
4019 #ifdef DEBUGGING
4020                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4021                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4022                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4023                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4024 #endif
4025 #if 0
4026                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4027                             regnode *nnxt = regnext(nxt1);
4028                             if (nnxt == nxt) {
4029                                 if (reg_off_by_arg[OP(nxt1)])
4030                                     ARG_SET(nxt1, nxt2 - nxt1);
4031                                 else if (nxt2 - nxt1 < U16_MAX)
4032                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4033                                 else
4034                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4035                             }
4036                             nxt1 = nnxt;
4037                         }
4038 #endif
4039                         /* Optimize again: */
4040                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4041                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4042                     }
4043                     else
4044                         oscan->flags = 0;
4045                 }
4046                 else if ((OP(oscan) == CURLYX)
4047                          && (flags & SCF_WHILEM_VISITED_POS)
4048                          /* See the comment on a similar expression above.
4049                             However, this time it's not a subexpression
4050                             we care about, but the expression itself. */
4051                          && (maxcount == REG_INFTY)
4052                          && data && ++data->whilem_c < 16) {
4053                     /* This stays as CURLYX, we can put the count/of pair. */
4054                     /* Find WHILEM (as in regexec.c) */
4055                     regnode *nxt = oscan + NEXT_OFF(oscan);
4056
4057                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4058                         nxt += ARG(nxt);
4059                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4060                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4061                 }
4062                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4063                     pars++;
4064                 if (flags & SCF_DO_SUBSTR) {
4065                     SV *last_str = NULL;
4066                     int counted = mincount != 0;
4067
4068                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4069 #if defined(SPARC64_GCC_WORKAROUND)
4070                         I32 b = 0;
4071                         STRLEN l = 0;
4072                         const char *s = NULL;
4073                         I32 old = 0;
4074
4075                         if (pos_before >= data->last_start_min)
4076                             b = pos_before;
4077                         else
4078                             b = data->last_start_min;
4079
4080                         l = 0;
4081                         s = SvPV_const(data->last_found, l);
4082                         old = b - data->last_start_min;
4083
4084 #else
4085                         I32 b = pos_before >= data->last_start_min
4086                             ? pos_before : data->last_start_min;
4087                         STRLEN l;
4088                         const char * const s = SvPV_const(data->last_found, l);
4089                         I32 old = b - data->last_start_min;
4090 #endif
4091
4092                         if (UTF)
4093                             old = utf8_hop((U8*)s, old) - (U8*)s;
4094                         l -= old;
4095                         /* Get the added string: */
4096                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4097                         if (deltanext == 0 && pos_before == b) {
4098                             /* What was added is a constant string */
4099                             if (mincount > 1) {
4100                                 SvGROW(last_str, (mincount * l) + 1);
4101                                 repeatcpy(SvPVX(last_str) + l,
4102                                           SvPVX_const(last_str), l, mincount - 1);
4103                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4104                                 /* Add additional parts. */
4105                                 SvCUR_set(data->last_found,
4106                                           SvCUR(data->last_found) - l);
4107                                 sv_catsv(data->last_found, last_str);
4108                                 {
4109                                     SV * sv = data->last_found;
4110                                     MAGIC *mg =
4111                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4112                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4113                                     if (mg && mg->mg_len >= 0)
4114                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4115                                 }
4116                                 data->last_end += l * (mincount - 1);
4117                             }
4118                         } else {
4119                             /* start offset must point into the last copy */
4120                             data->last_start_min += minnext * (mincount - 1);
4121                             data->last_start_max += is_inf ? I32_MAX
4122                                 : (maxcount - 1) * (minnext + data->pos_delta);
4123                         }
4124                     }
4125                     /* It is counted once already... */
4126                     data->pos_min += minnext * (mincount - counted);
4127 #if 0
4128 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4129     counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4130 if (deltanext != I32_MAX)
4131 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4132 #endif
4133                     if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4134                         data->pos_delta = I32_MAX;
4135                     else
4136                         data->pos_delta += - counted * deltanext +
4137                         (minnext + deltanext) * maxcount - minnext * mincount;
4138                     if (mincount != maxcount) {
4139                          /* Cannot extend fixed substrings found inside
4140                             the group.  */
4141                         SCAN_COMMIT(pRExC_state,data,minlenp);
4142                         if (mincount && last_str) {
4143                             SV * const sv = data->last_found;
4144                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4145                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4146
4147                             if (mg)
4148                                 mg->mg_len = -1;
4149                             sv_setsv(sv, last_str);
4150                             data->last_end = data->pos_min;
4151                             data->last_start_min =
4152                                 data->pos_min - CHR_SVLEN(last_str);
4153                             data->last_start_max = is_inf
4154                                 ? I32_MAX
4155                                 : data->pos_min + data->pos_delta
4156                                 - CHR_SVLEN(last_str);
4157                         }
4158                         data->longest = &(data->longest_float);
4159                     }
4160                     SvREFCNT_dec(last_str);
4161                 }
4162                 if (data && (fl & SF_HAS_EVAL))
4163                     data->flags |= SF_HAS_EVAL;
4164               optimize_curly_tail:
4165                 if (OP(oscan) != CURLYX) {
4166                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4167                            && NEXT_OFF(next))
4168                         NEXT_OFF(oscan) += NEXT_OFF(next);
4169                 }
4170                 continue;
4171             default:                    /* REF, and CLUMP only? */
4172                 if (flags & SCF_DO_SUBSTR) {
4173                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4174                     data->longest = &(data->longest_float);
4175                 }
4176                 is_inf = is_inf_internal = 1;
4177                 if (flags & SCF_DO_STCLASS_OR)
4178                     cl_anything(pRExC_state, data->start_class);
4179                 flags &= ~SCF_DO_STCLASS;
4180                 break;
4181             }
4182         }
4183         else if (OP(scan) == LNBREAK) {
4184             if (flags & SCF_DO_STCLASS) {
4185                 int value = 0;
4186                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4187                 if (flags & SCF_DO_STCLASS_AND) {
4188                     for (value = 0; value < 256; value++)
4189                         if (!is_VERTWS_cp(value))
4190                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4191                 }
4192                 else {
4193                     for (value = 0; value < 256; value++)
4194                         if (is_VERTWS_cp(value))
4195                             ANYOF_BITMAP_SET(data->start_class, value);
4196                 }
4197                 if (flags & SCF_DO_STCLASS_OR)
4198                     cl_and(data->start_class, and_withp);
4199                 flags &= ~SCF_DO_STCLASS;
4200             }
4201             min++;
4202             delta++;    /* Because of the 2 char string cr-lf */
4203             if (flags & SCF_DO_SUBSTR) {
4204                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4205                 data->pos_min += 1;
4206                 data->pos_delta += 1;
4207                 data->longest = &(data->longest_float);
4208             }
4209         }
4210         else if (REGNODE_SIMPLE(OP(scan))) {
4211             int value = 0;
4212
4213             if (flags & SCF_DO_SUBSTR) {
4214                 SCAN_COMMIT(pRExC_state,data,minlenp);
4215                 data->pos_min++;
4216             }
4217             min++;
4218             if (flags & SCF_DO_STCLASS) {
4219                 int loop_max = 256;
4220                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4221
4222                 /* Some of the logic below assumes that switching
4223                    locale on will only add false positives. */
4224                 switch (PL_regkind[OP(scan)]) {
4225                     U8 classnum;
4226
4227                 case SANY:
4228                 default:
4229 #ifdef DEBUGGING
4230                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4231 #endif
4232                  do_default:
4233                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4234                         cl_anything(pRExC_state, data->start_class);
4235                     break;
4236                 case REG_ANY:
4237                     if (OP(scan) == SANY)
4238                         goto do_default;
4239                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4240                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4241                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4242                         cl_anything(pRExC_state, data->start_class);
4243                     }
4244                     if (flags & SCF_DO_STCLASS_AND || !value)
4245                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4246                     break;
4247                 case ANYOF:
4248                     if (flags & SCF_DO_STCLASS_AND)
4249                         cl_and(data->start_class,
4250                                (struct regnode_charclass_class*)scan);
4251                     else
4252                         cl_or(pRExC_state, data->start_class,
4253                               (struct regnode_charclass_class*)scan);
4254                     break;
4255                 case POSIXA:
4256                     loop_max = 128;
4257                     /* FALL THROUGH */
4258                 case POSIXL:
4259                 case POSIXD:
4260                 case POSIXU:
4261                     classnum = FLAGS(scan);
4262                     if (flags & SCF_DO_STCLASS_AND) {
4263                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4264                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4265                             for (value = 0; value < loop_max; value++) {
4266                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4267                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4268                                 }
4269                             }
4270                         }
4271                     }
4272                     else {
4273                         if (data->start_class->flags & ANYOF_LOCALE) {
4274                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4275                         }
4276                         else {
4277
4278                         /* Even if under locale, set the bits for non-locale
4279                          * in case it isn't a true locale-node.  This will
4280                          * create false positives if it truly is locale */
4281                         for (value = 0; value < loop_max; value++) {
4282                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4283                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4284                             }
4285                         }
4286                         }
4287                     }
4288                     break;
4289                 case NPOSIXA:
4290                     loop_max = 128;
4291                     /* FALL THROUGH */
4292                 case NPOSIXL:
4293                 case NPOSIXU:
4294                 case NPOSIXD:
4295                     classnum = FLAGS(scan);
4296                     if (flags & SCF_DO_STCLASS_AND) {
4297                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4298                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4299                             for (value = 0; value < loop_max; value++) {
4300                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4301                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4302                                 }
4303                             }
4304                         }
4305                     }
4306                     else {
4307                         if (data->start_class->flags & ANYOF_LOCALE) {
4308                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4309                         }
4310                         else {
4311
4312                         /* Even if under locale, set the bits for non-locale in
4313                          * case it isn't a true locale-node.  This will create
4314                          * false positives if it truly is locale */
4315                         for (value = 0; value < loop_max; value++) {
4316                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4317                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4318                             }
4319                         }
4320                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4321                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4322                         }
4323                         }
4324                     }
4325                     break;
4326                 }
4327                 if (flags & SCF_DO_STCLASS_OR)
4328                     cl_and(data->start_class, and_withp);
4329                 flags &= ~SCF_DO_STCLASS;
4330             }
4331         }
4332         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4333             data->flags |= (OP(scan) == MEOL
4334                             ? SF_BEFORE_MEOL
4335                             : SF_BEFORE_SEOL);
4336             SCAN_COMMIT(pRExC_state, data, minlenp);
4337
4338         }
4339         else if (  PL_regkind[OP(scan)] == BRANCHJ
4340                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4341                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4342                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4343             if ( OP(scan) == UNLESSM &&
4344                  scan->flags == 0 &&
4345                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4346                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4347             ) {
4348                 regnode *opt;
4349                 regnode *upto= regnext(scan);
4350                 DEBUG_PARSE_r({
4351                     SV * const mysv_val=sv_newmortal();
4352                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4353
4354                     /*DEBUG_PARSE_MSG("opfail");*/
4355                     regprop(RExC_rx, mysv_val, upto);
4356                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4357                                   SvPV_nolen_const(mysv_val),
4358                                   (IV)REG_NODE_NUM(upto),
4359                                   (IV)(upto - scan)
4360                     );
4361                 });
4362                 OP(scan) = OPFAIL;
4363                 NEXT_OFF(scan) = upto - scan;
4364                 for (opt= scan + 1; opt < upto ; opt++)
4365                     OP(opt) = OPTIMIZED;
4366                 scan= upto;
4367                 continue;
4368             }
4369             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4370                 || OP(scan) == UNLESSM )
4371             {
4372                 /* Negative Lookahead/lookbehind
4373                    In this case we can't do fixed string optimisation.
4374                 */
4375
4376                 I32 deltanext, minnext, fake = 0;
4377                 regnode *nscan;
4378                 struct regnode_charclass_class intrnl;
4379                 int f = 0;
4380
4381                 data_fake.flags = 0;
4382                 if (data) {
4383                     data_fake.whilem_c = data->whilem_c;
4384                     data_fake.last_closep = data->last_closep;
4385                 }
4386                 else
4387                     data_fake.last_closep = &fake;
4388                 data_fake.pos_delta = delta;
4389                 if ( flags & SCF_DO_STCLASS && !scan->flags
4390                      && OP(scan) == IFMATCH ) { /* Lookahead */
4391                     cl_init(pRExC_state, &intrnl);
4392                     data_fake.start_class = &intrnl;
4393                     f |= SCF_DO_STCLASS_AND;
4394                 }
4395                 if (flags & SCF_WHILEM_VISITED_POS)
4396                     f |= SCF_WHILEM_VISITED_POS;
4397                 next = regnext(scan);
4398                 nscan = NEXTOPER(NEXTOPER(scan));
4399                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4400                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4401                 if (scan->flags) {
4402                     if (deltanext) {
4403                         FAIL("Variable length lookbehind not implemented");
4404                     }
4405                     else if (minnext > (I32)U8_MAX) {
4406                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4407                     }
4408                     scan->flags = (U8)minnext;
4409                 }
4410                 if (data) {
4411                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4412                         pars++;
4413                     if (data_fake.flags & SF_HAS_EVAL)
4414                         data->flags |= SF_HAS_EVAL;
4415                     data->whilem_c = data_fake.whilem_c;
4416                 }
4417                 if (f & SCF_DO_STCLASS_AND) {
4418                     if (flags & SCF_DO_STCLASS_OR) {
4419                         /* OR before, AND after: ideally we would recurse with
4420                          * data_fake to get the AND applied by study of the
4421                          * remainder of the pattern, and then derecurse;
4422                          * *** HACK *** for now just treat as "no information".
4423                          * See [perl #56690].
4424                          */
4425                         cl_init(pRExC_state, data->start_class);
4426                     }  else {
4427                         /* AND before and after: combine and continue */
4428                         const int was = TEST_SSC_EOS(data->start_class);
4429
4430                         cl_and(data->start_class, &intrnl);
4431                         if (was)
4432                             SET_SSC_EOS(data->start_class);
4433                     }
4434                 }
4435             }
4436 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4437             else {
4438                 /* Positive Lookahead/lookbehind
4439                    In this case we can do fixed string optimisation,
4440                    but we must be careful about it. Note in the case of
4441                    lookbehind the positions will be offset by the minimum
4442                    length of the pattern, something we won't know about
4443                    until after the recurse.
4444                 */
4445                 I32 deltanext, fake = 0;
4446                 regnode *nscan;
4447                 struct regnode_charclass_class intrnl;
4448                 int f = 0;
4449                 /* We use SAVEFREEPV so that when the full compile
4450                     is finished perl will clean up the allocated
4451                     minlens when it's all done. This way we don't
4452                     have to worry about freeing them when we know
4453                     they wont be used, which would be a pain.
4454                  */
4455                 I32 *minnextp;
4456                 Newx( minnextp, 1, I32 );
4457                 SAVEFREEPV(minnextp);
4458
4459                 if (data) {
4460                     StructCopy(data, &data_fake, scan_data_t);
4461                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4462                         f |= SCF_DO_SUBSTR;
4463                         if (scan->flags)
4464                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4465                         data_fake.last_found=newSVsv(data->last_found);
4466                     }
4467                 }
4468                 else
4469                     data_fake.last_closep = &fake;
4470                 data_fake.flags = 0;
4471                 data_fake.pos_delta = delta;
4472                 if (is_inf)
4473                     data_fake.flags |= SF_IS_INF;
4474                 if ( flags & SCF_DO_STCLASS && !scan->flags
4475                      && OP(scan) == IFMATCH ) { /* Lookahead */
4476                     cl_init(pRExC_state, &intrnl);
4477                     data_fake.start_class = &intrnl;
4478                     f |= SCF_DO_STCLASS_AND;
4479                 }
4480                 if (flags & SCF_WHILEM_VISITED_POS)
4481                     f |= SCF_WHILEM_VISITED_POS;
4482                 next = regnext(scan);
4483                 nscan = NEXTOPER(NEXTOPER(scan));
4484
4485                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4486                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4487                 if (scan->flags) {
4488                     if (deltanext) {
4489                         FAIL("Variable length lookbehind not implemented");
4490                     }
4491                     else if (*minnextp > (I32)U8_MAX) {
4492                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4493                     }
4494                     scan->flags = (U8)*minnextp;
4495                 }
4496
4497                 *minnextp += min;
4498
4499                 if (f & SCF_DO_STCLASS_AND) {
4500                     const int was = TEST_SSC_EOS(data.start_class);
4501
4502                     cl_and(data->start_class, &intrnl);
4503                     if (was)
4504                         SET_SSC_EOS(data->start_class);
4505                 }
4506                 if (data) {
4507                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4508                         pars++;
4509                     if (data_fake.flags & SF_HAS_EVAL)
4510                         data->flags |= SF_HAS_EVAL;
4511                     data->whilem_c = data_fake.whilem_c;
4512                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4513                         if (RExC_rx->minlen<*minnextp)
4514                             RExC_rx->minlen=*minnextp;
4515                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4516                         SvREFCNT_dec_NN(data_fake.last_found);
4517
4518                         if ( data_fake.minlen_fixed != minlenp )
4519                         {
4520                             data->offset_fixed= data_fake.offset_fixed;
4521                             data->minlen_fixed= data_fake.minlen_fixed;
4522                             data->lookbehind_fixed+= scan->flags;
4523                         }
4524                         if ( data_fake.minlen_float != minlenp )
4525                         {
4526                             data->minlen_float= data_fake.minlen_float;
4527                             data->offset_float_min=data_fake.offset_float_min;
4528                             data->offset_float_max=data_fake.offset_float_max;
4529                             data->lookbehind_float+= scan->flags;
4530                         }
4531                     }
4532                 }
4533             }
4534 #endif
4535         }
4536         else if (OP(scan) == OPEN) {
4537             if (stopparen != (I32)ARG(scan))
4538                 pars++;
4539         }
4540         else if (OP(scan) == CLOSE) {
4541             if (stopparen == (I32)ARG(scan)) {
4542                 break;
4543             }
4544             if ((I32)ARG(scan) == is_par) {
4545                 next = regnext(scan);
4546
4547                 if ( next && (OP(next) != WHILEM) && next < last)
4548                     is_par = 0;         /* Disable optimization */
4549             }
4550             if (data)
4551                 *(data->last_closep) = ARG(scan);
4552         }
4553         else if (OP(scan) == EVAL) {
4554                 if (data)
4555                     data->flags |= SF_HAS_EVAL;
4556         }
4557         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4558             if (flags & SCF_DO_SUBSTR) {
4559                 SCAN_COMMIT(pRExC_state,data,minlenp);
4560                 flags &= ~SCF_DO_SUBSTR;
4561             }
4562             if (data && OP(scan)==ACCEPT) {
4563                 data->flags |= SCF_SEEN_ACCEPT;
4564                 if (stopmin > min)
4565                     stopmin = min;
4566             }
4567         }
4568         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4569         {
4570                 if (flags & SCF_DO_SUBSTR) {
4571                     SCAN_COMMIT(pRExC_state,data,minlenp);
4572                     data->longest = &(data->longest_float);
4573                 }
4574                 is_inf = is_inf_internal = 1;
4575                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4576                     cl_anything(pRExC_state, data->start_class);
4577                 flags &= ~SCF_DO_STCLASS;
4578         }
4579         else if (OP(scan) == GPOS) {
4580             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4581                 !(delta || is_inf || (data && data->pos_delta)))
4582             {
4583                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4584                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4585                 if (RExC_rx->gofs < (U32)min)
4586                     RExC_rx->gofs = min;
4587             } else {
4588                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4589                 RExC_rx->gofs = 0;
4590             }
4591         }
4592 #ifdef TRIE_STUDY_OPT
4593 #ifdef FULL_TRIE_STUDY
4594         else if (PL_regkind[OP(scan)] == TRIE) {
4595             /* NOTE - There is similar code to this block above for handling
4596                BRANCH nodes on the initial study.  If you change stuff here
4597                check there too. */
4598             regnode *trie_node= scan;
4599             regnode *tail= regnext(scan);
4600             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4601             I32 max1 = 0, min1 = I32_MAX;
4602             struct regnode_charclass_class accum;
4603
4604             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4605                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4606             if (flags & SCF_DO_STCLASS)
4607                 cl_init_zero(pRExC_state, &accum);
4608
4609             if (!trie->jump) {
4610                 min1= trie->minlen;
4611                 max1= trie->maxlen;
4612             } else {
4613                 const regnode *nextbranch= NULL;
4614                 U32 word;
4615
4616                 for ( word=1 ; word <= trie->wordcount ; word++)
4617                 {
4618                     I32 deltanext=0, minnext=0, f = 0, fake;
4619                     struct regnode_charclass_class this_class;
4620
4621                     data_fake.flags = 0;
4622                     if (data) {
4623                         data_fake.whilem_c = data->whilem_c;
4624                         data_fake.last_closep = data->last_closep;
4625                     }
4626                     else
4627                         data_fake.last_closep = &fake;
4628                     data_fake.pos_delta = delta;
4629                     if (flags & SCF_DO_STCLASS) {
4630                         cl_init(pRExC_state, &this_class);
4631                         data_fake.start_class = &this_class;
4632                         f = SCF_DO_STCLASS_AND;
4633                     }
4634                     if (flags & SCF_WHILEM_VISITED_POS)
4635                         f |= SCF_WHILEM_VISITED_POS;
4636
4637                     if (trie->jump[word]) {
4638                         if (!nextbranch)
4639                             nextbranch = trie_node + trie->jump[0];
4640                         scan= trie_node + trie->jump[word];
4641                         /* We go from the jump point to the branch that follows
4642                            it. Note this means we need the vestigal unused branches
4643                            even though they arent otherwise used.
4644                          */
4645                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4646                             &deltanext, (regnode *)nextbranch, &data_fake,
4647                             stopparen, recursed, NULL, f,depth+1);
4648                     }
4649                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4650                         nextbranch= regnext((regnode*)nextbranch);
4651
4652                     if (min1 > (I32)(minnext + trie->minlen))
4653                         min1 = minnext + trie->minlen;
4654                     if (deltanext == I32_MAX) {
4655                         is_inf = is_inf_internal = 1;
4656                         max1 = I32_MAX;
4657                     } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4658                         max1 = minnext + deltanext + trie->maxlen;
4659
4660                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4661                         pars++;
4662                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4663                         if ( stopmin > min + min1)
4664                             stopmin = min + min1;
4665                         flags &= ~SCF_DO_SUBSTR;
4666                         if (data)
4667                             data->flags |= SCF_SEEN_ACCEPT;
4668                     }
4669                     if (data) {
4670                         if (data_fake.flags & SF_HAS_EVAL)
4671                             data->flags |= SF_HAS_EVAL;
4672                         data->whilem_c = data_fake.whilem_c;
4673                     }
4674                     if (flags & SCF_DO_STCLASS)
4675                         cl_or(pRExC_state, &accum, &this_class);
4676                 }
4677             }
4678             if (flags & SCF_DO_SUBSTR) {
4679                 data->pos_min += min1;
4680                 data->pos_delta += max1 - min1;
4681                 if (max1 != min1 || is_inf)
4682                     data->longest = &(data->longest_float);
4683             }
4684             min += min1;
4685             delta += max1 - min1;
4686             if (flags & SCF_DO_STCLASS_OR) {
4687                 cl_or(pRExC_state, data->start_class, &accum);
4688                 if (min1) {
4689                     cl_and(data->start_class, and_withp);
4690                     flags &= ~SCF_DO_STCLASS;
4691                 }
4692             }
4693             else if (flags & SCF_DO_STCLASS_AND) {
4694                 if (min1) {
4695                     cl_and(data->start_class, &accum);
4696                     flags &= ~SCF_DO_STCLASS;
4697                 }
4698                 else {
4699                     /* Switch to OR mode: cache the old value of
4700                      * data->start_class */
4701                     INIT_AND_WITHP;
4702                     StructCopy(data->start_class, and_withp,
4703                                struct regnode_charclass_class);
4704                     flags &= ~SCF_DO_STCLASS_AND;
4705                     StructCopy(&accum, data->start_class,
4706                                struct regnode_charclass_class);
4707                     flags |= SCF_DO_STCLASS_OR;
4708                     SET_SSC_EOS(data->start_class);
4709                 }
4710             }
4711             scan= tail;
4712             continue;
4713         }
4714 #else
4715         else if (PL_regkind[OP(scan)] == TRIE) {
4716             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4717             U8*bang=NULL;
4718
4719             min += trie->minlen;
4720             delta += (trie->maxlen - trie->minlen);
4721             flags &= ~SCF_DO_STCLASS; /* xxx */
4722             if (flags & SCF_DO_SUBSTR) {
4723                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4724                 data->pos_min += trie->minlen;
4725                 data->pos_delta += (trie->maxlen - trie->minlen);
4726                 if (trie->maxlen != trie->minlen)
4727                     data->longest = &(data->longest_float);
4728             }
4729             if (trie->jump) /* no more substrings -- for now /grr*/
4730                 flags &= ~SCF_DO_SUBSTR;
4731         }
4732 #endif /* old or new */
4733 #endif /* TRIE_STUDY_OPT */
4734
4735         /* Else: zero-length, ignore. */
4736         scan = regnext(scan);
4737     }
4738     if (frame) {
4739         last = frame->last;
4740         scan = frame->next;
4741         stopparen = frame->stop;
4742         frame = frame->prev;
4743         goto fake_study_recurse;
4744     }
4745
4746   finish:
4747     assert(!frame);
4748     DEBUG_STUDYDATA("pre-fin:",data,depth);
4749
4750     *scanp = scan;
4751     *deltap = is_inf_internal ? I32_MAX : delta;
4752     if (flags & SCF_DO_SUBSTR && is_inf)
4753         data->pos_delta = I32_MAX - data->pos_min;
4754     if (is_par > (I32)U8_MAX)
4755         is_par = 0;
4756     if (is_par && pars==1 && data) {
4757         data->flags |= SF_IN_PAR;
4758         data->flags &= ~SF_HAS_PAR;
4759     }
4760     else if (pars && data) {
4761         data->flags |= SF_HAS_PAR;
4762         data->flags &= ~SF_IN_PAR;
4763     }
4764     if (flags & SCF_DO_STCLASS_OR)
4765         cl_and(data->start_class, and_withp);
4766     if (flags & SCF_TRIE_RESTUDY)
4767         data->flags |=  SCF_TRIE_RESTUDY;
4768
4769     DEBUG_STUDYDATA("post-fin:",data,depth);
4770
4771     return min < stopmin ? min : stopmin;
4772 }
4773
4774 STATIC U32
4775 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4776 {
4777     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4778
4779     PERL_ARGS_ASSERT_ADD_DATA;
4780
4781     Renewc(RExC_rxi->data,
4782            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4783            char, struct reg_data);
4784     if(count)
4785         Renew(RExC_rxi->data->what, count + n, U8);
4786     else
4787         Newx(RExC_rxi->data->what, n, U8);
4788     RExC_rxi->data->count = count + n;
4789     Copy(s, RExC_rxi->data->what + count, n, U8);
4790     return count;
4791 }
4792
4793 /*XXX: todo make this not included in a non debugging perl */
4794 #ifndef PERL_IN_XSUB_RE
4795 void
4796 Perl_reginitcolors(pTHX)
4797 {
4798     dVAR;
4799     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4800     if (s) {
4801         char *t = savepv(s);
4802         int i = 0;
4803         PL_colors[0] = t;
4804         while (++i < 6) {
4805             t = strchr(t, '\t');
4806             if (t) {
4807                 *t = '\0';
4808                 PL_colors[i] = ++t;
4809             }
4810             else
4811                 PL_colors[i] = t = (char *)"";
4812         }
4813     } else {
4814         int i = 0;
4815         while (i < 6)
4816             PL_colors[i++] = (char *)"";
4817     }
4818     PL_colorset = 1;
4819 }
4820 #endif
4821
4822
4823 #ifdef TRIE_STUDY_OPT
4824 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4825     STMT_START {                                            \
4826         if (                                                \
4827               (data.flags & SCF_TRIE_RESTUDY)               \
4828               && ! restudied++                              \
4829         ) {                                                 \
4830             dOsomething;                                    \
4831             goto reStudy;                                   \
4832         }                                                   \
4833     } STMT_END
4834 #else
4835 #define CHECK_RESTUDY_GOTO_butfirst
4836 #endif
4837
4838 /*
4839  * pregcomp - compile a regular expression into internal code
4840  *
4841  * Decides which engine's compiler to call based on the hint currently in
4842  * scope
4843  */
4844
4845 #ifndef PERL_IN_XSUB_RE
4846
4847 /* return the currently in-scope regex engine (or the default if none)  */
4848
4849 regexp_engine const *
4850 Perl_current_re_engine(pTHX)
4851 {
4852     dVAR;
4853
4854     if (IN_PERL_COMPILETIME) {
4855         HV * const table = GvHV(PL_hintgv);
4856         SV **ptr;
4857
4858         if (!table)
4859             return &PL_core_reg_engine;
4860         ptr = hv_fetchs(table, "regcomp", FALSE);
4861         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4862             return &PL_core_reg_engine;
4863         return INT2PTR(regexp_engine*,SvIV(*ptr));
4864     }
4865     else {
4866         SV *ptr;
4867         if (!PL_curcop->cop_hints_hash)
4868             return &PL_core_reg_engine;
4869         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4870         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4871             return &PL_core_reg_engine;
4872         return INT2PTR(regexp_engine*,SvIV(ptr));
4873     }
4874 }
4875
4876
4877 REGEXP *
4878 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4879 {
4880     dVAR;
4881     regexp_engine const *eng = current_re_engine();
4882     GET_RE_DEBUG_FLAGS_DECL;
4883
4884     PERL_ARGS_ASSERT_PREGCOMP;
4885
4886     /* Dispatch a request to compile a regexp to correct regexp engine. */
4887     DEBUG_COMPILE_r({
4888         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4889                         PTR2UV(eng));
4890     });
4891     return CALLREGCOMP_ENG(eng, pattern, flags);
4892 }
4893 #endif
4894
4895 /* public(ish) entry point for the perl core's own regex compiling code.
4896  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4897  * pattern rather than a list of OPs, and uses the internal engine rather
4898  * than the current one */
4899
4900 REGEXP *
4901 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4902 {
4903     SV *pat = pattern; /* defeat constness! */
4904     PERL_ARGS_ASSERT_RE_COMPILE;
4905     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4906 #ifdef PERL_IN_XSUB_RE
4907                                 &my_reg_engine,
4908 #else
4909                                 &PL_core_reg_engine,
4910 #endif
4911                                 NULL, NULL, rx_flags, 0);
4912 }
4913
4914
4915 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4916  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4917  * point to the realloced string and length.
4918  *
4919  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4920  * stuff added */
4921
4922 static void
4923 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4924                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
4925 {
4926     U8 *const src = (U8*)*pat_p;
4927     U8 *dst;
4928     int n=0;
4929     STRLEN s = 0, d = 0;
4930     bool do_end = 0;
4931     GET_RE_DEBUG_FLAGS_DECL;
4932
4933     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4934         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4935
4936     Newx(dst, *plen_p * 2 + 1, U8);
4937
4938     while (s < *plen_p) {
4939         const UV uv = NATIVE_TO_ASCII(src[s]);
4940         if (UNI_IS_INVARIANT(uv))
4941             dst[d]   = (U8)UTF_TO_NATIVE(uv);
4942         else {
4943             dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4944             dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4945         }
4946         if (n < num_code_blocks) {
4947             if (!do_end && pRExC_state->code_blocks[n].start == s) {
4948                 pRExC_state->code_blocks[n].start = d;
4949                 assert(dst[d] == '(');
4950                 do_end = 1;
4951             }
4952             else if (do_end && pRExC_state->code_blocks[n].end == s) {
4953                 pRExC_state->code_blocks[n].end = d;
4954                 assert(dst[d] == ')');
4955                 do_end = 0;
4956                 n++;
4957             }
4958         }
4959         s++;
4960         d++;
4961     }
4962     dst[d] = '\0';
4963     *plen_p = d;
4964     *pat_p = (char*) dst;
4965     SAVEFREEPV(*pat_p);
4966     RExC_orig_utf8 = RExC_utf8 = 1;
4967 }
4968
4969
4970
4971 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4972  * while recording any code block indices, and handling overloading,
4973  * nested qr// objects etc.  If pat is null, it will allocate a new
4974  * string, or just return the first arg, if there's only one.
4975  *
4976  * Returns the malloced/updated pat.
4977  * patternp and pat_count is the array of SVs to be concatted;
4978  * oplist is the optional list of ops that generated the SVs;
4979  * recompile_p is a pointer to a boolean that will be set if
4980  *   the regex will need to be recompiled.
4981  * delim, if non-null is an SV that will be inserted between each element
4982  */
4983
4984 static SV*
4985 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4986                 SV *pat, SV ** const patternp, int pat_count,
4987                 OP *oplist, bool *recompile_p, SV *delim)
4988 {
4989     SV **svp;
4990     int n = 0;
4991     bool use_delim = FALSE;
4992     bool alloced = FALSE;
4993
4994     /* if we know we have at least two args, create an empty string,
4995      * then concatenate args to that. For no args, return an empty string */
4996     if (!pat && pat_count != 1) {
4997         pat = newSVpvn("", 0);
4998         SAVEFREESV(pat);
4999         alloced = TRUE;
5000     }
5001
5002     for (svp = patternp; svp < patternp + pat_count; svp++) {
5003         SV *sv;
5004         SV *rx  = NULL;
5005         STRLEN orig_patlen = 0;
5006         bool code = 0;
5007         SV *msv = use_delim ? delim : *svp;
5008
5009         /* if we've got a delimiter, we go round the loop twice for each
5010          * svp slot (except the last), using the delimiter the second
5011          * time round */
5012         if (use_delim) {
5013             svp--;
5014             use_delim = FALSE;
5015         }
5016         else if (delim)
5017             use_delim = TRUE;
5018
5019         if (SvTYPE(msv) == SVt_PVAV) {
5020             /* we've encountered an interpolated array within
5021              * the pattern, e.g. /...@a..../. Expand the list of elements,
5022              * then recursively append elements.
5023              * The code in this block is based on S_pushav() */
5024
5025             AV *const av = (AV*)msv;
5026             const I32 maxarg = AvFILL(av) + 1;
5027             SV **array;
5028
5029             if (oplist) {
5030                 assert(oplist->op_type == OP_PADAV
5031                     || oplist->op_type == OP_RV2AV);
5032                 oplist = oplist->op_sibling;;
5033             }
5034
5035             if (SvRMAGICAL(av)) {
5036                 U32 i;
5037
5038                 Newx(array, maxarg, SV*);
5039                 SAVEFREEPV(array);
5040                 for (i=0; i < (U32)maxarg; i++) {
5041                     SV ** const svp = av_fetch(av, i, FALSE);
5042                     array[i] = svp ? *svp : &PL_sv_undef;
5043                 }
5044             }
5045             else
5046                 array = AvARRAY(av);
5047
5048             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5049                                 array, maxarg, NULL, recompile_p,
5050                                 /* $" */
5051                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5052
5053             continue;
5054         }
5055
5056
5057         /* we make the assumption here that each op in the list of
5058          * op_siblings maps to one SV pushed onto the stack,
5059          * except for code blocks, with have both an OP_NULL and
5060          * and OP_CONST.
5061          * This allows us to match up the list of SVs against the
5062          * list of OPs to find the next code block.
5063          *
5064          * Note that       PUSHMARK PADSV PADSV ..
5065          * is optimised to
5066          *                 PADRANGE PADSV  PADSV  ..
5067          * so the alignment still works. */
5068
5069         if (oplist) {
5070             if (oplist->op_type == OP_NULL
5071                 && (oplist->op_flags & OPf_SPECIAL))
5072             {
5073                 assert(n < pRExC_state->num_code_blocks);
5074                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5075                 pRExC_state->code_blocks[n].block = oplist;
5076                 pRExC_state->code_blocks[n].src_regex = NULL;
5077                 n++;
5078                 code = 1;
5079                 oplist = oplist->op_sibling; /* skip CONST */
5080                 assert(oplist);
5081             }
5082             oplist = oplist->op_sibling;;
5083         }
5084
5085         /* apply magic and QR overloading to arg */
5086
5087         SvGETMAGIC(msv);
5088         if (SvROK(msv) && SvAMAGIC(msv)) {
5089             SV *sv = AMG_CALLunary(msv, regexp_amg);
5090             if (sv) {
5091                 if (SvROK(sv))
5092                     sv = SvRV(sv);
5093                 if (SvTYPE(sv) != SVt_REGEXP)
5094                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5095                 msv = sv;
5096             }
5097         }
5098
5099         /* try concatenation overload ... */
5100         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5101                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5102         {
5103             sv_setsv(pat, sv);
5104             /* overloading involved: all bets are off over literal
5105              * code. Pretend we haven't seen it */
5106             pRExC_state->num_code_blocks -= n;
5107             n = 0;
5108         }
5109         else  {
5110             /* ... or failing that, try "" overload */
5111             while (SvAMAGIC(msv)
5112                     && (sv = AMG_CALLunary(msv, string_amg))
5113                     && sv != msv
5114                     &&  !(   SvROK(msv)
5115                           && SvROK(sv)
5116                           && SvRV(msv) == SvRV(sv))
5117             ) {
5118                 msv = sv;
5119                 SvGETMAGIC(msv);
5120             }
5121             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5122                 msv = SvRV(msv);
5123
5124             if (pat) {
5125                 /* this is a partially unrolled
5126                  *     sv_catsv_nomg(pat, msv);
5127                  * that allows us to adjust code block indices if
5128                  * needed */
5129                 STRLEN dlen;
5130                 char *dst = SvPV_force_nomg(pat, dlen);
5131                 orig_patlen = dlen;
5132                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5133                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5134                     sv_setpvn(pat, dst, dlen);
5135                     SvUTF8_on(pat);
5136                 }
5137                 sv_catsv_nomg(pat, msv);
5138                 rx = msv;
5139             }
5140             else
5141                 pat = msv;
5142
5143             if (code)
5144                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5145         }
5146
5147         /* extract any code blocks within any embedded qr//'s */
5148         if (rx && SvTYPE(rx) == SVt_REGEXP
5149             && RX_ENGINE((REGEXP*)rx)->op_comp)
5150         {
5151
5152             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5153             if (ri->num_code_blocks) {
5154                 int i;
5155                 /* the presence of an embedded qr// with code means
5156                  * we should always recompile: the text of the
5157                  * qr// may not have changed, but it may be a
5158                  * different closure than last time */
5159                 *recompile_p = 1;
5160                 Renew(pRExC_state->code_blocks,
5161                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5162                     struct reg_code_block);
5163                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5164
5165                 for (i=0; i < ri->num_code_blocks; i++) {
5166                     struct reg_code_block *src, *dst;
5167                     STRLEN offset =  orig_patlen
5168                         + ReANY((REGEXP *)rx)->pre_prefix;
5169                     assert(n < pRExC_state->num_code_blocks);
5170                     src = &ri->code_blocks[i];
5171                     dst = &pRExC_state->code_blocks[n];
5172                     dst->start      = src->start + offset;
5173                     dst->end        = src->end   + offset;
5174                     dst->block      = src->block;
5175                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5176                                             src->src_regex
5177                                                 ? src->src_regex
5178                                                 : (REGEXP*)rx);
5179                     n++;
5180                 }
5181             }
5182         }
5183     }
5184     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5185     if (alloced)
5186         SvSETMAGIC(pat);
5187
5188     return pat;
5189 }
5190
5191
5192
5193 /* see if there are any run-time code blocks in the pattern.
5194  * False positives are allowed */
5195
5196 static bool
5197 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5198                     char *pat, STRLEN plen)
5199 {
5200     int n = 0;
5201     STRLEN s;
5202
5203     for (s = 0; s < plen; s++) {
5204         if (n < pRExC_state->num_code_blocks
5205             && s == pRExC_state->code_blocks[n].start)
5206         {
5207             s = pRExC_state->code_blocks[n].end;
5208             n++;
5209             continue;
5210         }
5211         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5212          * positives here */
5213         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5214             (pat[s+2] == '{'
5215                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5216         )
5217             return 1;
5218     }
5219     return 0;
5220 }
5221
5222 /* Handle run-time code blocks. We will already have compiled any direct
5223  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5224  * copy of it, but with any literal code blocks blanked out and
5225  * appropriate chars escaped; then feed it into
5226  *
5227  *    eval "qr'modified_pattern'"
5228  *
5229  * For example,
5230  *
5231  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5232  *
5233  * becomes
5234  *
5235  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5236  *
5237  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5238  * and merge them with any code blocks of the original regexp.
5239  *
5240  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5241  * instead, just save the qr and return FALSE; this tells our caller that
5242  * the original pattern needs upgrading to utf8.
5243  */
5244
5245 static bool
5246 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5247     char *pat, STRLEN plen)
5248 {
5249     SV *qr;
5250
5251     GET_RE_DEBUG_FLAGS_DECL;
5252
5253     if (pRExC_state->runtime_code_qr) {
5254         /* this is the second time we've been called; this should
5255          * only happen if the main pattern got upgraded to utf8
5256          * during compilation; re-use the qr we compiled first time
5257          * round (which should be utf8 too)
5258          */
5259         qr = pRExC_state->runtime_code_qr;
5260         pRExC_state->runtime_code_qr = NULL;
5261         assert(RExC_utf8 && SvUTF8(qr));
5262     }
5263     else {
5264         int n = 0;
5265         STRLEN s;
5266         char *p, *newpat;
5267         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5268         SV *sv, *qr_ref;
5269         dSP;
5270
5271         /* determine how many extra chars we need for ' and \ escaping */
5272         for (s = 0; s < plen; s++) {
5273             if (pat[s] == '\'' || pat[s] == '\\')
5274                 newlen++;
5275         }
5276
5277         Newx(newpat, newlen, char);
5278         p = newpat;
5279         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5280
5281         for (s = 0; s < plen; s++) {
5282             if (n < pRExC_state->num_code_blocks
5283                 && s == pRExC_state->code_blocks[n].start)
5284             {
5285                 /* blank out literal code block */
5286                 assert(pat[s] == '(');
5287                 while (s <= pRExC_state->code_blocks[n].end) {
5288                     *p++ = '_';
5289                     s++;
5290                 }
5291                 s--;
5292                 n++;
5293                 continue;
5294             }
5295             if (pat[s] == '\'' || pat[s] == '\\')
5296                 *p++ = '\\';
5297             *p++ = pat[s];
5298         }
5299         *p++ = '\'';
5300         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5301             *p++ = 'x';
5302         *p++ = '\0';
5303         DEBUG_COMPILE_r({
5304             PerlIO_printf(Perl_debug_log,
5305                 "%sre-parsing pattern for runtime code:%s %s\n",
5306                 PL_colors[4],PL_colors[5],newpat);
5307         });
5308
5309         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5310         Safefree(newpat);
5311
5312         ENTER;
5313         SAVETMPS;
5314         save_re_context();
5315         PUSHSTACKi(PERLSI_REQUIRE);
5316         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5317          * parsing qr''; normally only q'' does this. It also alters
5318          * hints handling */
5319         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5320         SvREFCNT_dec_NN(sv);
5321         SPAGAIN;
5322         qr_ref = POPs;
5323         PUTBACK;
5324         {
5325             SV * const errsv = ERRSV;
5326             if (SvTRUE_NN(errsv))
5327             {
5328                 Safefree(pRExC_state->code_blocks);
5329                 /* use croak_sv ? */
5330                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5331             }
5332         }
5333         assert(SvROK(qr_ref));
5334         qr = SvRV(qr_ref);
5335         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5336         /* the leaving below frees the tmp qr_ref.
5337          * Give qr a life of its own */
5338         SvREFCNT_inc(qr);
5339         POPSTACK;
5340         FREETMPS;
5341         LEAVE;
5342
5343     }
5344
5345     if (!RExC_utf8 && SvUTF8(qr)) {
5346         /* first time through; the pattern got upgraded; save the
5347          * qr for the next time through */
5348         assert(!pRExC_state->runtime_code_qr);
5349         pRExC_state->runtime_code_qr = qr;
5350         return 0;
5351     }
5352
5353
5354     /* extract any code blocks within the returned qr//  */
5355
5356
5357     /* merge the main (r1) and run-time (r2) code blocks into one */
5358     {
5359         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5360         struct reg_code_block *new_block, *dst;
5361         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5362         int i1 = 0, i2 = 0;
5363
5364         if (!r2->num_code_blocks) /* we guessed wrong */
5365         {
5366             SvREFCNT_dec_NN(qr);
5367             return 1;
5368         }
5369
5370         Newx(new_block,
5371             r1->num_code_blocks + r2->num_code_blocks,
5372             struct reg_code_block);
5373         dst = new_block;
5374
5375         while (    i1 < r1->num_code_blocks
5376                 || i2 < r2->num_code_blocks)
5377         {
5378             struct reg_code_block *src;
5379             bool is_qr = 0;
5380
5381             if (i1 == r1->num_code_blocks) {
5382                 src = &r2->code_blocks[i2++];
5383                 is_qr = 1;
5384             }
5385             else if (i2 == r2->num_code_blocks)
5386                 src = &r1->code_blocks[i1++];
5387             else if (  r1->code_blocks[i1].start
5388                      < r2->code_blocks[i2].start)
5389             {
5390                 src = &r1->code_blocks[i1++];
5391                 assert(src->end < r2->code_blocks[i2].start);
5392             }
5393             else {
5394                 assert(  r1->code_blocks[i1].start
5395                        > r2->code_blocks[i2].start);
5396                 src = &r2->code_blocks[i2++];
5397                 is_qr = 1;
5398                 assert(src->end < r1->code_blocks[i1].start);
5399             }
5400
5401             assert(pat[src->start] == '(');
5402             assert(pat[src->end]   == ')');
5403             dst->start      = src->start;
5404             dst->end        = src->end;
5405             dst->block      = src->block;
5406             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5407                                     : src->src_regex;
5408             dst++;
5409         }
5410         r1->num_code_blocks += r2->num_code_blocks;
5411         Safefree(r1->code_blocks);
5412         r1->code_blocks = new_block;
5413     }
5414
5415     SvREFCNT_dec_NN(qr);
5416     return 1;
5417 }
5418
5419
5420 STATIC bool
5421 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5422 {
5423     /* This is the common code for setting up the floating and fixed length
5424      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5425      * as to whether succeeded or not */
5426
5427     I32 t,ml;
5428
5429     if (! (longest_length
5430            || (eol /* Can't have SEOL and MULTI */
5431                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5432           )
5433             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5434         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5435     {
5436         return FALSE;
5437     }
5438
5439     /* copy the information about the longest from the reg_scan_data
5440         over to the program. */
5441     if (SvUTF8(sv_longest)) {
5442         *rx_utf8 = sv_longest;
5443         *rx_substr = NULL;
5444     } else {
5445         *rx_substr = sv_longest;
5446         *rx_utf8 = NULL;
5447     }
5448     /* end_shift is how many chars that must be matched that
5449         follow this item. We calculate it ahead of time as once the
5450         lookbehind offset is added in we lose the ability to correctly
5451         calculate it.*/
5452     ml = minlen ? *(minlen) : (I32)longest_length;
5453     *rx_end_shift = ml - offset
5454         - longest_length + (SvTAIL(sv_longest) != 0)
5455         + lookbehind;
5456
5457     t = (eol/* Can't have SEOL and MULTI */
5458          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5459     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5460
5461     return TRUE;
5462 }
5463
5464 /*
5465  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5466  * regular expression into internal code.
5467  * The pattern may be passed either as:
5468  *    a list of SVs (patternp plus pat_count)
5469  *    a list of OPs (expr)
5470  * If both are passed, the SV list is used, but the OP list indicates
5471  * which SVs are actually pre-compiled code blocks
5472  *
5473  * The SVs in the list have magic and qr overloading applied to them (and
5474  * the list may be modified in-place with replacement SVs in the latter
5475  * case).
5476  *
5477  * If the pattern hasn't changed from old_re, then old_re will be
5478  * returned.
5479  *
5480  * eng is the current engine. If that engine has an op_comp method, then
5481  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5482  * do the initial concatenation of arguments and pass on to the external
5483  * engine.
5484  *
5485  * If is_bare_re is not null, set it to a boolean indicating whether the
5486  * arg list reduced (after overloading) to a single bare regex which has
5487  * been returned (i.e. /$qr/).
5488  *
5489  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5490  *
5491  * pm_flags contains the PMf_* flags, typically based on those from the
5492  * pm_flags field of the related PMOP. Currently we're only interested in
5493  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5494  *
5495  * We can't allocate space until we know how big the compiled form will be,
5496  * but we can't compile it (and thus know how big it is) until we've got a
5497  * place to put the code.  So we cheat:  we compile it twice, once with code
5498  * generation turned off and size counting turned on, and once "for real".
5499  * This also means that we don't allocate space until we are sure that the
5500  * thing really will compile successfully, and we never have to move the
5501  * code and thus invalidate pointers into it.  (Note that it has to be in
5502  * one piece because free() must be able to free it all.) [NB: not true in perl]
5503  *
5504  * Beware that the optimization-preparation code in here knows about some
5505  * of the structure of the compiled regexp.  [I'll say.]
5506  */
5507
5508 REGEXP *
5509 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5510                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5511                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5512 {
5513     dVAR;
5514     REGEXP *rx;
5515     struct regexp *r;
5516     regexp_internal *ri;
5517     STRLEN plen;
5518     char *exp;
5519     regnode *scan;
5520     I32 flags;
5521     I32 minlen = 0;
5522     U32 rx_flags;
5523     SV *pat;
5524     SV *code_blocksv = NULL;
5525     SV** new_patternp = patternp;
5526
5527     /* these are all flags - maybe they should be turned
5528      * into a single int with different bit masks */
5529     I32 sawlookahead = 0;
5530     I32 sawplus = 0;
5531     I32 sawopen = 0;
5532     I32 sawminmod = 0;
5533
5534     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5535     bool recompile = 0;
5536     bool runtime_code = 0;
5537     scan_data_t data;
5538     RExC_state_t RExC_state;
5539     RExC_state_t * const pRExC_state = &RExC_state;
5540 #ifdef TRIE_STUDY_OPT
5541     int restudied = 0;
5542     RExC_state_t copyRExC_state;
5543 #endif
5544     GET_RE_DEBUG_FLAGS_DECL;
5545
5546     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5547
5548     DEBUG_r(if (!PL_colorset) reginitcolors());
5549
5550 #ifndef PERL_IN_XSUB_RE
5551     /* Initialize these here instead of as-needed, as is quick and avoids
5552      * having to test them each time otherwise */
5553     if (! PL_AboveLatin1) {
5554         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5555         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5556         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5557
5558         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5559                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5560         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5561                                 = _new_invlist_C_array(PosixAlnum_invlist);
5562
5563         PL_L1Posix_ptrs[_CC_ALPHA]
5564                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5565         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5566
5567         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5568         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5569
5570         /* Cased is the same as Alpha in the ASCII range */
5571         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5572         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5573
5574         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5575         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5576
5577         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5578         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5579
5580         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5581         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5582
5583         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5584         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5585
5586         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5587         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5588
5589         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5590         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5591
5592         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5593         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5594         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5595         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5596
5597         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5598         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5599
5600         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5601
5602         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5603         PL_L1Posix_ptrs[_CC_WORDCHAR]
5604                                 = _new_invlist_C_array(L1PosixWord_invlist);
5605
5606         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5607         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5608
5609         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5610     }
5611 #endif
5612
5613     pRExC_state->code_blocks = NULL;
5614     pRExC_state->num_code_blocks = 0;
5615
5616     if (is_bare_re)
5617         *is_bare_re = FALSE;
5618
5619     if (expr && (expr->op_type == OP_LIST ||
5620                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5621         /* allocate code_blocks if needed */
5622         OP *o;
5623         int ncode = 0;
5624
5625         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5626             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5627                 ncode++; /* count of DO blocks */
5628         if (ncode) {
5629             pRExC_state->num_code_blocks = ncode;
5630             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5631         }
5632     }
5633
5634     if (!pat_count) {
5635         /* compile-time pattern with just OP_CONSTs and DO blocks */
5636
5637         int n;
5638         OP *o;
5639
5640         /* find how many CONSTs there are */
5641         assert(expr);
5642         n = 0;
5643         if (expr->op_type == OP_CONST)
5644             n = 1;
5645         else
5646             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5647                 if (o->op_type == OP_CONST)
5648                     n++;
5649             }
5650
5651         /* fake up an SV array */
5652
5653         assert(!new_patternp);
5654         Newx(new_patternp, n, SV*);
5655         SAVEFREEPV(new_patternp);
5656         pat_count = n;
5657
5658         n = 0;
5659         if (expr->op_type == OP_CONST)
5660             new_patternp[n] = cSVOPx_sv(expr);
5661         else
5662             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5663                 if (o->op_type == OP_CONST)
5664                     new_patternp[n++] = cSVOPo_sv;
5665             }
5666
5667     }
5668
5669     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5670         "Assembling pattern from %d elements%s\n", pat_count,
5671             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5672
5673     /* set expr to the first arg op */
5674
5675     if (pRExC_state->num_code_blocks
5676          && expr->op_type != OP_CONST)
5677     {
5678             expr = cLISTOPx(expr)->op_first;
5679             assert(   expr->op_type == OP_PUSHMARK
5680                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5681                    || expr->op_type == OP_PADRANGE);
5682             expr = expr->op_sibling;
5683     }
5684
5685     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5686                         expr, &recompile, NULL);
5687
5688     /* handle bare (possibly after overloading) regex: foo =~ $re */
5689     {
5690         SV *re = pat;
5691         if (SvROK(re))
5692             re = SvRV(re);
5693         if (SvTYPE(re) == SVt_REGEXP) {
5694             if (is_bare_re)
5695                 *is_bare_re = TRUE;
5696             SvREFCNT_inc(re);
5697             Safefree(pRExC_state->code_blocks);
5698             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5699                 "Precompiled pattern%s\n",
5700                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5701
5702             return (REGEXP*)re;
5703         }
5704     }
5705
5706     exp = SvPV_nomg(pat, plen);
5707
5708     if (!eng->op_comp) {
5709         if ((SvUTF8(pat) && IN_BYTES)
5710                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5711         {
5712             /* make a temporary copy; either to convert to bytes,
5713              * or to avoid repeating get-magic / overloaded stringify */
5714             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5715                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5716         }
5717         Safefree(pRExC_state->code_blocks);
5718         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5719     }
5720
5721     /* ignore the utf8ness if the pattern is 0 length */
5722     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5723     RExC_uni_semantics = 0;
5724     RExC_contains_locale = 0;
5725     pRExC_state->runtime_code_qr = NULL;
5726
5727     DEBUG_COMPILE_r({
5728             SV *dsv= sv_newmortal();
5729             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5730             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5731                           PL_colors[4],PL_colors[5],s);
5732         });
5733
5734   redo_first_pass:
5735     /* we jump here if we upgrade the pattern to utf8 and have to
5736      * recompile */
5737
5738     if ((pm_flags & PMf_USE_RE_EVAL)
5739                 /* this second condition covers the non-regex literal case,
5740                  * i.e.  $foo =~ '(?{})'. */
5741                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5742     )
5743         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5744
5745     /* return old regex if pattern hasn't changed */
5746     /* XXX: note in the below we have to check the flags as well as the pattern.
5747      *
5748      * Things get a touch tricky as we have to compare the utf8 flag independently
5749      * from the compile flags.
5750      */
5751
5752     if (   old_re
5753         && !recompile
5754         && !!RX_UTF8(old_re) == !!RExC_utf8
5755         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5756         && RX_PRECOMP(old_re)
5757         && RX_PRELEN(old_re) == plen
5758         && memEQ(RX_PRECOMP(old_re), exp, plen)
5759         && !runtime_code /* with runtime code, always recompile */ )
5760     {
5761         Safefree(pRExC_state->code_blocks);
5762         return old_re;
5763     }
5764
5765     rx_flags = orig_rx_flags;
5766
5767     if (initial_charset == REGEX_LOCALE_CHARSET) {
5768         RExC_contains_locale = 1;
5769     }
5770     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5771
5772         /* Set to use unicode semantics if the pattern is in utf8 and has the
5773          * 'depends' charset specified, as it means unicode when utf8  */
5774         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5775     }
5776
5777     RExC_precomp = exp;
5778     RExC_flags = rx_flags;
5779     RExC_pm_flags = pm_flags;
5780
5781     if (runtime_code) {
5782         if (TAINTING_get && TAINT_get)
5783             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5784
5785         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5786             /* whoops, we have a non-utf8 pattern, whilst run-time code
5787              * got compiled as utf8. Try again with a utf8 pattern */
5788             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5789                                     pRExC_state->num_code_blocks);
5790             goto redo_first_pass;
5791         }
5792     }
5793     assert(!pRExC_state->runtime_code_qr);
5794
5795     RExC_sawback = 0;
5796
5797     RExC_seen = 0;
5798     RExC_in_lookbehind = 0;
5799     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5800     RExC_extralen = 0;
5801     RExC_override_recoding = 0;
5802     RExC_in_multi_char_class = 0;
5803
5804     /* First pass: determine size, legality. */
5805     RExC_parse = exp;
5806     RExC_start = exp;
5807     RExC_end = exp + plen;
5808     RExC_naughty = 0;
5809     RExC_npar = 1;
5810     RExC_nestroot = 0;
5811     RExC_size = 0L;
5812     RExC_emit = &RExC_emit_dummy;
5813     RExC_whilem_seen = 0;
5814     RExC_open_parens = NULL;
5815     RExC_close_parens = NULL;
5816     RExC_opend = NULL;
5817     RExC_paren_names = NULL;
5818 #ifdef DEBUGGING
5819     RExC_paren_name_list = NULL;
5820 #endif
5821     RExC_recurse = NULL;
5822     RExC_recurse_count = 0;
5823     pRExC_state->code_index = 0;
5824
5825 #if 0 /* REGC() is (currently) a NOP at the first pass.
5826        * Clever compilers notice this and complain. --jhi */
5827     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5828 #endif
5829     DEBUG_PARSE_r(
5830         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5831         RExC_lastnum=0;
5832         RExC_lastparse=NULL;
5833     );
5834     /* reg may croak on us, not giving us a chance to free
5835        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5836        need it to survive as long as the regexp (qr/(?{})/).
5837        We must check that code_blocksv is not already set, because we may
5838        have jumped back to restart the sizing pass. */
5839     if (pRExC_state->code_blocks && !code_blocksv) {
5840         code_blocksv = newSV_type(SVt_PV);
5841         SAVEFREESV(code_blocksv);
5842         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5843         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5844     }
5845     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5846         /* It's possible to write a regexp in ascii that represents Unicode
5847         codepoints outside of the byte range, such as via \x{100}. If we
5848         detect such a sequence we have to convert the entire pattern to utf8
5849         and then recompile, as our sizing calculation will have been based
5850         on 1 byte == 1 character, but we will need to use utf8 to encode
5851         at least some part of the pattern, and therefore must convert the whole
5852         thing.
5853         -- dmq */
5854         if (flags & RESTART_UTF8) {
5855             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5856                                     pRExC_state->num_code_blocks);
5857             goto redo_first_pass;
5858         }
5859         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5860     }
5861     if (code_blocksv)
5862         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5863
5864     DEBUG_PARSE_r({
5865         PerlIO_printf(Perl_debug_log,
5866             "Required size %"IVdf" nodes\n"
5867             "Starting second pass (creation)\n",
5868             (IV)RExC_size);
5869         RExC_lastnum=0;
5870         RExC_lastparse=NULL;
5871     });
5872
5873     /* The first pass could have found things that force Unicode semantics */
5874     if ((RExC_utf8 || RExC_uni_semantics)
5875          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5876     {
5877         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5878     }
5879
5880     /* Small enough for pointer-storage convention?
5881        If extralen==0, this means that we will not need long jumps. */
5882     if (RExC_size >= 0x10000L && RExC_extralen)
5883         RExC_size += RExC_extralen;
5884     else
5885         RExC_extralen = 0;
5886     if (RExC_whilem_seen > 15)
5887         RExC_whilem_seen = 15;
5888
5889     /* Allocate space and zero-initialize. Note, the two step process
5890        of zeroing when in debug mode, thus anything assigned has to
5891        happen after that */
5892     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5893     r = ReANY(rx);
5894     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5895          char, regexp_internal);
5896     if ( r == NULL || ri == NULL )
5897         FAIL("Regexp out of space");
5898 #ifdef DEBUGGING
5899     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5900     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5901 #else
5902     /* bulk initialize base fields with 0. */
5903     Zero(ri, sizeof(regexp_internal), char);
5904 #endif
5905
5906     /* non-zero initialization begins here */
5907     RXi_SET( r, ri );
5908     r->engine= eng;
5909     r->extflags = rx_flags;
5910     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5911
5912     if (pm_flags & PMf_IS_QR) {
5913         ri->code_blocks = pRExC_state->code_blocks;
5914         ri->num_code_blocks = pRExC_state->num_code_blocks;
5915     }
5916     else
5917     {
5918         int n;
5919         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5920             if (pRExC_state->code_blocks[n].src_regex)
5921                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5922         SAVEFREEPV(pRExC_state->code_blocks);
5923     }
5924
5925     {
5926         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5927         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5928
5929         /* The caret is output if there are any defaults: if not all the STD
5930          * flags are set, or if no character set specifier is needed */
5931         bool has_default =
5932                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5933                     || ! has_charset);
5934         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5935         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5936                             >> RXf_PMf_STD_PMMOD_SHIFT);
5937         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5938         char *p;
5939         /* Allocate for the worst case, which is all the std flags are turned
5940          * on.  If more precision is desired, we could do a population count of
5941          * the flags set.  This could be done with a small lookup table, or by
5942          * shifting, masking and adding, or even, when available, assembly
5943          * language for a machine-language population count.
5944          * We never output a minus, as all those are defaults, so are
5945          * covered by the caret */
5946         const STRLEN wraplen = plen + has_p + has_runon
5947             + has_default       /* If needs a caret */
5948
5949                 /* If needs a character set specifier */
5950             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5951             + (sizeof(STD_PAT_MODS) - 1)
5952             + (sizeof("(?:)") - 1);
5953
5954         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5955         r->xpv_len_u.xpvlenu_pv = p;
5956         if (RExC_utf8)
5957             SvFLAGS(rx) |= SVf_UTF8;
5958         *p++='('; *p++='?';
5959
5960         /* If a default, cover it using the caret */
5961         if (has_default) {
5962             *p++= DEFAULT_PAT_MOD;
5963         }
5964         if (has_charset) {
5965             STRLEN len;
5966             const char* const name = get_regex_charset_name(r->extflags, &len);
5967             Copy(name, p, len, char);
5968             p += len;
5969         }
5970         if (has_p)
5971             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5972         {
5973             char ch;
5974             while((ch = *fptr++)) {
5975                 if(reganch & 1)
5976                     *p++ = ch;
5977                 reganch >>= 1;
5978             }
5979         }
5980
5981         *p++ = ':';
5982         Copy(RExC_precomp, p, plen, char);
5983         assert ((RX_WRAPPED(rx) - p) < 16);
5984         r->pre_prefix = p - RX_WRAPPED(rx);
5985         p += plen;
5986         if (has_runon)
5987             *p++ = '\n';
5988         *p++ = ')';
5989         *p = 0;
5990         SvCUR_set(rx, p - RX_WRAPPED(rx));
5991     }
5992
5993     r->intflags = 0;
5994     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5995
5996     if (RExC_seen & REG_SEEN_RECURSE) {
5997         Newxz(RExC_open_parens, RExC_npar,regnode *);
5998         SAVEFREEPV(RExC_open_parens);
5999         Newxz(RExC_close_parens,RExC_npar,regnode *);
6000         SAVEFREEPV(RExC_close_parens);
6001     }
6002
6003     /* Useful during FAIL. */
6004 #ifdef RE_TRACK_PATTERN_OFFSETS
6005     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6006     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6007                           "%s %"UVuf" bytes for offset annotations.\n",
6008                           ri->u.offsets ? "Got" : "Couldn't get",
6009                           (UV)((2*RExC_size+1) * sizeof(U32))));
6010 #endif
6011     SetProgLen(ri,RExC_size);
6012     RExC_rx_sv = rx;
6013     RExC_rx = r;
6014     RExC_rxi = ri;
6015
6016     /* Second pass: emit code. */
6017     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6018     RExC_pm_flags = pm_flags;
6019     RExC_parse = exp;
6020     RExC_end = exp + plen;
6021     RExC_naughty = 0;
6022     RExC_npar = 1;
6023     RExC_emit_start = ri->program;
6024     RExC_emit = ri->program;
6025     RExC_emit_bound = ri->program + RExC_size + 1;
6026     pRExC_state->code_index = 0;
6027
6028     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6029     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6030         ReREFCNT_dec(rx);
6031         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6032     }
6033     /* XXXX To minimize changes to RE engine we always allocate
6034        3-units-long substrs field. */
6035     Newx(r->substrs, 1, struct reg_substr_data);
6036     if (RExC_recurse_count) {
6037         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6038         SAVEFREEPV(RExC_recurse);
6039     }
6040
6041 reStudy:
6042     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6043     Zero(r->substrs, 1, struct reg_substr_data);
6044
6045 #ifdef TRIE_STUDY_OPT
6046     if (!restudied) {
6047         StructCopy(&zero_scan_data, &data, scan_data_t);
6048         copyRExC_state = RExC_state;
6049     } else {
6050         U32 seen=RExC_seen;
6051         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6052
6053         RExC_state = copyRExC_state;
6054         if (seen & REG_TOP_LEVEL_BRANCHES)
6055             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6056         else
6057             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6058         StructCopy(&zero_scan_data, &data, scan_data_t);
6059     }
6060 #else
6061     StructCopy(&zero_scan_data, &data, scan_data_t);
6062 #endif
6063
6064     /* Dig out information for optimizations. */
6065     r->extflags = RExC_flags; /* was pm_op */
6066     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6067
6068     if (UTF)
6069         SvUTF8_on(rx);  /* Unicode in it? */
6070     ri->regstclass = NULL;
6071     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6072         r->intflags |= PREGf_NAUGHTY;
6073     scan = ri->program + 1;             /* First BRANCH. */
6074
6075     /* testing for BRANCH here tells us whether there is "must appear"
6076        data in the pattern. If there is then we can use it for optimisations */
6077     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6078         I32 fake;
6079         STRLEN longest_float_length, longest_fixed_length;
6080         struct regnode_charclass_class ch_class; /* pointed to by data */
6081         int stclass_flag;
6082         I32 last_close = 0; /* pointed to by data */
6083         regnode *first= scan;
6084         regnode *first_next= regnext(first);
6085         /*
6086          * Skip introductions and multiplicators >= 1
6087          * so that we can extract the 'meat' of the pattern that must
6088          * match in the large if() sequence following.
6089          * NOTE that EXACT is NOT covered here, as it is normally
6090          * picked up by the optimiser separately.
6091          *
6092          * This is unfortunate as the optimiser isnt handling lookahead
6093          * properly currently.
6094          *
6095          */
6096         while ((OP(first) == OPEN && (sawopen = 1)) ||
6097                /* An OR of *one* alternative - should not happen now. */
6098             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6099             /* for now we can't handle lookbehind IFMATCH*/
6100             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6101             (OP(first) == PLUS) ||
6102             (OP(first) == MINMOD) ||
6103                /* An {n,m} with n>0 */
6104             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6105             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6106         {
6107                 /*
6108                  * the only op that could be a regnode is PLUS, all the rest
6109                  * will be regnode_1 or regnode_2.
6110                  *
6111                  * (yves doesn't think this is true)
6112                  */
6113                 if (OP(first) == PLUS)
6114                     sawplus = 1;
6115                 else {
6116                     if (OP(first) == MINMOD)
6117                         sawminmod = 1;
6118                     first += regarglen[OP(first)];
6119                 }
6120                 first = NEXTOPER(first);
6121                 first_next= regnext(first);
6122         }
6123
6124         /* Starting-point info. */
6125       again:
6126         DEBUG_PEEP("first:",first,0);
6127         /* Ignore EXACT as we deal with it later. */
6128         if (PL_regkind[OP(first)] == EXACT) {
6129             if (OP(first) == EXACT)
6130                 NOOP;   /* Empty, get anchored substr later. */
6131             else
6132                 ri->regstclass = first;
6133         }
6134 #ifdef TRIE_STCLASS
6135         else if (PL_regkind[OP(first)] == TRIE &&
6136                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6137         {
6138             regnode *trie_op;
6139             /* this can happen only on restudy */
6140             if ( OP(first) == TRIE ) {
6141                 struct regnode_1 *trieop = (struct regnode_1 *)
6142                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6143                 StructCopy(first,trieop,struct regnode_1);
6144                 trie_op=(regnode *)trieop;
6145             } else {
6146                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6147                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6148                 StructCopy(first,trieop,struct regnode_charclass);
6149                 trie_op=(regnode *)trieop;
6150             }
6151             OP(trie_op)+=2;
6152             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6153             ri->regstclass = trie_op;
6154         }
6155 #endif
6156         else if (REGNODE_SIMPLE(OP(first)))
6157             ri->regstclass = first;
6158         else if (PL_regkind[OP(first)] == BOUND ||
6159                  PL_regkind[OP(first)] == NBOUND)
6160             ri->regstclass = first;
6161         else if (PL_regkind[OP(first)] == BOL) {
6162             r->extflags |= (OP(first) == MBOL
6163                            ? RXf_ANCH_MBOL
6164                            : (OP(first) == SBOL
6165                               ? RXf_ANCH_SBOL
6166                               : RXf_ANCH_BOL));
6167             first = NEXTOPER(first);
6168             goto again;
6169         }
6170         else if (OP(first) == GPOS) {
6171             r->extflags |= RXf_ANCH_GPOS;
6172             first = NEXTOPER(first);
6173             goto again;
6174         }
6175         else if ((!sawopen || !RExC_sawback) &&
6176             (OP(first) == STAR &&
6177             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6178             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6179         {
6180             /* turn .* into ^.* with an implied $*=1 */
6181             const int type =
6182                 (OP(NEXTOPER(first)) == REG_ANY)
6183                     ? RXf_ANCH_MBOL
6184                     : RXf_ANCH_SBOL;
6185             r->extflags |= type;
6186             r->intflags |= PREGf_IMPLICIT;
6187             first = NEXTOPER(first);
6188             goto again;
6189         }
6190         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6191             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6192             /* x+ must match at the 1st pos of run of x's */
6193             r->intflags |= PREGf_SKIP;
6194
6195         /* Scan is after the zeroth branch, first is atomic matcher. */
6196 #ifdef TRIE_STUDY_OPT
6197         DEBUG_PARSE_r(
6198             if (!restudied)
6199                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6200                               (IV)(first - scan + 1))
6201         );
6202 #else
6203         DEBUG_PARSE_r(
6204             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6205                 (IV)(first - scan + 1))
6206         );
6207 #endif
6208
6209
6210         /*
6211         * If there's something expensive in the r.e., find the
6212         * longest literal string that must appear and make it the
6213         * regmust.  Resolve ties in favor of later strings, since
6214         * the regstart check works with the beginning of the r.e.
6215         * and avoiding duplication strengthens checking.  Not a
6216         * strong reason, but sufficient in the absence of others.
6217         * [Now we resolve ties in favor of the earlier string if
6218         * it happens that c_offset_min has been invalidated, since the
6219         * earlier string may buy us something the later one won't.]
6220         */
6221
6222         data.longest_fixed = newSVpvs("");
6223         data.longest_float = newSVpvs("");
6224         data.last_found = newSVpvs("");
6225         data.longest = &(data.longest_fixed);
6226         ENTER_with_name("study_chunk");
6227         SAVEFREESV(data.longest_fixed);
6228         SAVEFREESV(data.longest_float);
6229         SAVEFREESV(data.last_found);
6230         first = scan;
6231         if (!ri->regstclass) {
6232             cl_init(pRExC_state, &ch_class);
6233             data.start_class = &ch_class;
6234             stclass_flag = SCF_DO_STCLASS_AND;
6235         } else                          /* XXXX Check for BOUND? */
6236             stclass_flag = 0;
6237         data.last_closep = &last_close;
6238
6239         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6240             &data, -1, NULL, NULL,
6241             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6242                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6243             0);
6244
6245
6246         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6247
6248
6249         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6250              && data.last_start_min == 0 && data.last_end > 0
6251              && !RExC_seen_zerolen
6252              && !(RExC_seen & REG_SEEN_VERBARG)
6253              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6254             r->extflags |= RXf_CHECK_ALL;
6255         scan_commit(pRExC_state, &data,&minlen,0);
6256
6257         longest_float_length = CHR_SVLEN(data.longest_float);
6258
6259         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6260                    && data.offset_fixed == data.offset_float_min
6261                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6262             && S_setup_longest (aTHX_ pRExC_state,
6263                                     data.longest_float,
6264                                     &(r->float_utf8),
6265                                     &(r->float_substr),
6266                                     &(r->float_end_shift),
6267                                     data.lookbehind_float,
6268                                     data.offset_float_min,
6269                                     data.minlen_float,
6270                                     longest_float_length,
6271                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6272                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6273         {
6274             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6275             r->float_max_offset = data.offset_float_max;
6276             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6277                 r->float_max_offset -= data.lookbehind_float;
6278             SvREFCNT_inc_simple_void_NN(data.longest_float);
6279         }
6280         else {
6281             r->float_substr = r->float_utf8 = NULL;
6282             longest_float_length = 0;
6283         }
6284
6285         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6286
6287         if (S_setup_longest (aTHX_ pRExC_state,
6288                                 data.longest_fixed,
6289                                 &(r->anchored_utf8),
6290                                 &(r->anchored_substr),
6291                                 &(r->anchored_end_shift),
6292                                 data.lookbehind_fixed,
6293                                 data.offset_fixed,
6294                                 data.minlen_fixed,
6295                                 longest_fixed_length,
6296                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6297                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6298         {
6299             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6300             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6301         }
6302         else {
6303             r->anchored_substr = r->anchored_utf8 = NULL;
6304             longest_fixed_length = 0;
6305         }
6306         LEAVE_with_name("study_chunk");
6307
6308         if (ri->regstclass
6309             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6310             ri->regstclass = NULL;
6311
6312         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6313             && stclass_flag
6314             && ! TEST_SSC_EOS(data.start_class)
6315             && !cl_is_anything(data.start_class))
6316         {
6317             const U32 n = add_data(pRExC_state, 1, "f");
6318             OP(data.start_class) = ANYOF_SYNTHETIC;
6319
6320             Newx(RExC_rxi->data->data[n], 1,
6321                 struct regnode_charclass_class);
6322             StructCopy(data.start_class,
6323                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6324                        struct regnode_charclass_class);
6325             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6326             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6327             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6328                       regprop(r, sv, (regnode*)data.start_class);
6329                       PerlIO_printf(Perl_debug_log,
6330                                     "synthetic stclass \"%s\".\n",
6331                                     SvPVX_const(sv));});
6332         }
6333
6334         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6335         if (longest_fixed_length > longest_float_length) {
6336             r->check_end_shift = r->anchored_end_shift;
6337             r->check_substr = r->anchored_substr;
6338             r->check_utf8 = r->anchored_utf8;
6339             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6340             if (r->extflags & RXf_ANCH_SINGLE)
6341                 r->extflags |= RXf_NOSCAN;
6342         }
6343         else {
6344             r->check_end_shift = r->float_end_shift;
6345             r->check_substr = r->float_substr;
6346             r->check_utf8 = r->float_utf8;
6347             r->check_offset_min = r->float_min_offset;
6348             r->check_offset_max = r->float_max_offset;
6349         }
6350         if ((r->check_substr || r->check_utf8) ) {
6351             r->extflags |= RXf_USE_INTUIT;
6352             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6353                 r->extflags |= RXf_INTUIT_TAIL;
6354         }
6355         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6356         if ( (STRLEN)minlen < longest_float_length )
6357             minlen= longest_float_length;
6358         if ( (STRLEN)minlen < longest_fixed_length )
6359             minlen= longest_fixed_length;
6360         */
6361     }
6362     else {
6363         /* Several toplevels. Best we can is to set minlen. */
6364         I32 fake;
6365         struct regnode_charclass_class ch_class;
6366         I32 last_close = 0;
6367
6368         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6369
6370         scan = ri->program + 1;
6371         cl_init(pRExC_state, &ch_class);
6372         data.start_class = &ch_class;
6373         data.last_closep = &last_close;
6374
6375
6376         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6377             &data, -1, NULL, NULL,
6378             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6379                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6380             0);
6381
6382         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6383
6384         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6385                 = r->float_substr = r->float_utf8 = NULL;
6386
6387         if (! TEST_SSC_EOS(data.start_class)
6388             && !cl_is_anything(data.start_class))
6389         {
6390             const U32 n = add_data(pRExC_state, 1, "f");
6391             OP(data.start_class) = ANYOF_SYNTHETIC;
6392
6393             Newx(RExC_rxi->data->data[n], 1,
6394                 struct regnode_charclass_class);
6395             StructCopy(data.start_class,
6396                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6397                        struct regnode_charclass_class);
6398             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6399             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6400             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6401                       regprop(r, sv, (regnode*)data.start_class);
6402                       PerlIO_printf(Perl_debug_log,
6403                                     "synthetic stclass \"%s\".\n",
6404                                     SvPVX_const(sv));});
6405         }
6406     }
6407
6408     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6409        the "real" pattern. */
6410     DEBUG_OPTIMISE_r({
6411         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6412                       (IV)minlen, (IV)r->minlen);
6413     });
6414     r->minlenret = minlen;
6415     if (r->minlen < minlen)
6416         r->minlen = minlen;
6417
6418     if (RExC_seen & REG_SEEN_GPOS)
6419         r->extflags |= RXf_GPOS_SEEN;
6420     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6421         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6422     if (pRExC_state->num_code_blocks)
6423         r->extflags |= RXf_EVAL_SEEN;
6424     if (RExC_seen & REG_SEEN_CANY)
6425         r->extflags |= RXf_CANY_SEEN;
6426     if (RExC_seen & REG_SEEN_VERBARG)
6427     {
6428         r->intflags |= PREGf_VERBARG_SEEN;
6429         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6430     }
6431     if (RExC_seen & REG_SEEN_CUTGROUP)
6432         r->intflags |= PREGf_CUTGROUP_SEEN;
6433     if (pm_flags & PMf_USE_RE_EVAL)
6434         r->intflags |= PREGf_USE_RE_EVAL;
6435     if (RExC_paren_names)
6436         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6437     else
6438         RXp_PAREN_NAMES(r) = NULL;
6439
6440     {
6441         regnode *first = ri->program + 1;
6442         U8 fop = OP(first);
6443         regnode *next = NEXTOPER(first);
6444         U8 nop = OP(next);
6445
6446         if (PL_regkind[fop] == NOTHING && nop == END)
6447             r->extflags |= RXf_NULL;
6448         else if (PL_regkind[fop] == BOL && nop == END)
6449             r->extflags |= RXf_START_ONLY;
6450         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6451             r->extflags |= RXf_WHITE;
6452         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6453             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6454
6455     }
6456 #ifdef DEBUGGING
6457     if (RExC_paren_names) {
6458         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6459         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6460     } else
6461 #endif
6462         ri->name_list_idx = 0;
6463
6464     if (RExC_recurse_count) {
6465         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6466             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6467             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6468         }
6469     }
6470     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6471     /* assume we don't need to swap parens around before we match */
6472
6473     DEBUG_DUMP_r({
6474         PerlIO_printf(Perl_debug_log,"Final program:\n");
6475         regdump(r);
6476     });
6477 #ifdef RE_TRACK_PATTERN_OFFSETS
6478     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6479         const U32 len = ri->u.offsets[0];
6480         U32 i;
6481         GET_RE_DEBUG_FLAGS_DECL;
6482         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6483         for (i = 1; i <= len; i++) {
6484             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6485                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6486                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6487             }
6488         PerlIO_printf(Perl_debug_log, "\n");
6489     });
6490 #endif
6491
6492 #ifdef USE_ITHREADS
6493     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6494      * by setting the regexp SV to readonly-only instead. If the
6495      * pattern's been recompiled, the USEDness should remain. */
6496     if (old_re && SvREADONLY(old_re))
6497         SvREADONLY_on(rx);
6498 #endif
6499     return rx;
6500 }
6501
6502
6503 SV*
6504 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6505                     const U32 flags)
6506 {
6507     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6508
6509     PERL_UNUSED_ARG(value);
6510
6511     if (flags & RXapif_FETCH) {
6512         return reg_named_buff_fetch(rx, key, flags);
6513     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6514         Perl_croak_no_modify();
6515         return NULL;
6516     } else if (flags & RXapif_EXISTS) {
6517         return reg_named_buff_exists(rx, key, flags)
6518             ? &PL_sv_yes
6519             : &PL_sv_no;
6520     } else if (flags & RXapif_REGNAMES) {
6521         return reg_named_buff_all(rx, flags);
6522     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6523         return reg_named_buff_scalar(rx, flags);
6524     } else {
6525         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6526         return NULL;
6527     }
6528 }
6529
6530 SV*
6531 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6532                          const U32 flags)
6533 {
6534     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6535     PERL_UNUSED_ARG(lastkey);
6536
6537     if (flags & RXapif_FIRSTKEY)
6538         return reg_named_buff_firstkey(rx, flags);
6539     else if (flags & RXapif_NEXTKEY)
6540         return reg_named_buff_nextkey(rx, flags);
6541     else {
6542         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6543         return NULL;
6544     }
6545 }
6546
6547 SV*
6548 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6549                           const U32 flags)
6550 {
6551     AV *retarray = NULL;
6552     SV *ret;
6553     struct regexp *const rx = ReANY(r);
6554
6555     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6556
6557     if (flags & RXapif_ALL)
6558         retarray=newAV();
6559
6560     if (rx && RXp_PAREN_NAMES(rx)) {
6561         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6562         if (he_str) {
6563             IV i;
6564             SV* sv_dat=HeVAL(he_str);
6565             I32 *nums=(I32*)SvPVX(sv_dat);
6566             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6567                 if ((I32)(rx->nparens) >= nums[i]
6568                     && rx->offs[nums[i]].start != -1
6569                     && rx->offs[nums[i]].end != -1)
6570                 {
6571                     ret = newSVpvs("");
6572                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6573                     if (!retarray)
6574                         return ret;
6575                 } else {
6576                     if (retarray)
6577                         ret = newSVsv(&PL_sv_undef);
6578                 }
6579                 if (retarray)
6580                     av_push(retarray, ret);
6581             }
6582             if (retarray)
6583                 return newRV_noinc(MUTABLE_SV(retarray));
6584         }
6585     }
6586     return NULL;
6587 }
6588
6589 bool
6590 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6591                            const U32 flags)
6592 {
6593     struct regexp *const rx = ReANY(r);
6594
6595     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6596
6597     if (rx && RXp_PAREN_NAMES(rx)) {
6598         if (flags & RXapif_ALL) {
6599             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6600         } else {
6601             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6602             if (sv) {
6603                 SvREFCNT_dec_NN(sv);
6604                 return TRUE;
6605             } else {
6606                 return FALSE;
6607             }
6608         }
6609     } else {
6610         return FALSE;
6611     }
6612 }
6613
6614 SV*
6615 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6616 {
6617     struct regexp *const rx = ReANY(r);
6618
6619     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6620
6621     if ( rx && RXp_PAREN_NAMES(rx) ) {
6622         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6623
6624         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6625     } else {
6626         return FALSE;
6627     }
6628 }
6629
6630 SV*
6631 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6632 {
6633     struct regexp *const rx = ReANY(r);
6634     GET_RE_DEBUG_FLAGS_DECL;
6635
6636     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6637
6638     if (rx && RXp_PAREN_NAMES(rx)) {
6639         HV *hv = RXp_PAREN_NAMES(rx);
6640         HE *temphe;
6641         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6642             IV i;
6643             IV parno = 0;
6644             SV* sv_dat = HeVAL(temphe);
6645             I32 *nums = (I32*)SvPVX(sv_dat);
6646             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6647                 if ((I32)(rx->lastparen) >= nums[i] &&
6648                     rx->offs[nums[i]].start != -1 &&
6649                     rx->offs[nums[i]].end != -1)
6650                 {
6651                     parno = nums[i];
6652                     break;
6653                 }
6654             }
6655             if (parno || flags & RXapif_ALL) {
6656                 return newSVhek(HeKEY_hek(temphe));
6657             }
6658         }
6659     }
6660     return NULL;
6661 }
6662
6663 SV*
6664 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6665 {
6666     SV *ret;
6667     AV *av;
6668     I32 length;
6669     struct regexp *const rx = ReANY(r);
6670
6671     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6672
6673     if (rx && RXp_PAREN_NAMES(rx)) {
6674         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6675             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6676         } else if (flags & RXapif_ONE) {
6677             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6678             av = MUTABLE_AV(SvRV(ret));
6679             length = av_len(av);
6680             SvREFCNT_dec_NN(ret);
6681             return newSViv(length + 1);
6682         } else {
6683             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6684             return NULL;
6685         }
6686     }
6687     return &PL_sv_undef;
6688 }
6689
6690 SV*
6691 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6692 {
6693     struct regexp *const rx = ReANY(r);
6694     AV *av = newAV();
6695
6696     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6697
6698     if (rx && RXp_PAREN_NAMES(rx)) {
6699         HV *hv= RXp_PAREN_NAMES(rx);
6700         HE *temphe;
6701         (void)hv_iterinit(hv);
6702         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6703             IV i;
6704             IV parno = 0;
6705             SV* sv_dat = HeVAL(temphe);
6706             I32 *nums = (I32*)SvPVX(sv_dat);
6707             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6708                 if ((I32)(rx->lastparen) >= nums[i] &&
6709                     rx->offs[nums[i]].start != -1 &&
6710                     rx->offs[nums[i]].end != -1)
6711                 {
6712                     parno = nums[i];
6713                     break;
6714                 }
6715             }
6716             if (parno || flags & RXapif_ALL) {
6717                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6718             }
6719         }
6720     }
6721
6722     return newRV_noinc(MUTABLE_SV(av));
6723 }
6724
6725 void
6726 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6727                              SV * const sv)
6728 {
6729     struct regexp *const rx = ReANY(r);
6730     char *s = NULL;
6731     I32 i = 0;
6732     I32 s1, t1;
6733     I32 n = paren;
6734
6735     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6736
6737     if (      n == RX_BUFF_IDX_CARET_PREMATCH
6738            || n == RX_BUFF_IDX_CARET_FULLMATCH
6739            || n == RX_BUFF_IDX_CARET_POSTMATCH
6740        )
6741     {
6742         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6743         if (!keepcopy) {
6744             /* on something like
6745              *    $r = qr/.../;
6746              *    /$qr/p;
6747              * the KEEPCOPY is set on the PMOP rather than the regex */
6748             if (PL_curpm && r == PM_GETRE(PL_curpm))
6749                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6750         }
6751         if (!keepcopy)
6752             goto ret_undef;
6753     }
6754
6755     if (!rx->subbeg)
6756         goto ret_undef;
6757
6758     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6759         /* no need to distinguish between them any more */
6760         n = RX_BUFF_IDX_FULLMATCH;
6761
6762     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6763         && rx->offs[0].start != -1)
6764     {
6765         /* $`, ${^PREMATCH} */
6766         i = rx->offs[0].start;
6767         s = rx->subbeg;
6768     }
6769     else
6770     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6771         && rx->offs[0].end != -1)
6772     {
6773         /* $', ${^POSTMATCH} */
6774         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6775         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6776     }
6777     else
6778     if ( 0 <= n && n <= (I32)rx->nparens &&
6779         (s1 = rx->offs[n].start) != -1 &&
6780         (t1 = rx->offs[n].end) != -1)
6781     {
6782         /* $&, ${^MATCH},  $1 ... */
6783         i = t1 - s1;
6784         s = rx->subbeg + s1 - rx->suboffset;
6785     } else {
6786         goto ret_undef;
6787     }
6788
6789     assert(s >= rx->subbeg);
6790     assert(rx->sublen >= (s - rx->subbeg) + i );
6791     if (i >= 0) {
6792 #if NO_TAINT_SUPPORT
6793         sv_setpvn(sv, s, i);
6794 #else
6795         const int oldtainted = TAINT_get;
6796         TAINT_NOT;
6797         sv_setpvn(sv, s, i);
6798         TAINT_set(oldtainted);
6799 #endif
6800         if ( (rx->extflags & RXf_CANY_SEEN)
6801             ? (RXp_MATCH_UTF8(rx)
6802                         && (!i || is_utf8_string((U8*)s, i)))
6803             : (RXp_MATCH_UTF8(rx)) )
6804         {
6805             SvUTF8_on(sv);
6806         }
6807         else
6808             SvUTF8_off(sv);
6809         if (TAINTING_get) {
6810             if (RXp_MATCH_TAINTED(rx)) {
6811                 if (SvTYPE(sv) >= SVt_PVMG) {
6812                     MAGIC* const mg = SvMAGIC(sv);
6813                     MAGIC* mgt;
6814                     TAINT;
6815                     SvMAGIC_set(sv, mg->mg_moremagic);
6816                     SvTAINT(sv);
6817                     if ((mgt = SvMAGIC(sv))) {
6818                         mg->mg_moremagic = mgt;
6819                         SvMAGIC_set(sv, mg);
6820                     }
6821                 } else {
6822                     TAINT;
6823                     SvTAINT(sv);
6824                 }
6825             } else
6826                 SvTAINTED_off(sv);
6827         }
6828     } else {
6829       ret_undef:
6830         sv_setsv(sv,&PL_sv_undef);
6831         return;
6832     }
6833 }
6834
6835 void
6836 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6837                                                          SV const * const value)
6838 {
6839     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6840
6841     PERL_UNUSED_ARG(rx);
6842     PERL_UNUSED_ARG(paren);
6843     PERL_UNUSED_ARG(value);
6844
6845     if (!PL_localizing)
6846         Perl_croak_no_modify();
6847 }
6848
6849 I32
6850 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6851                               const I32 paren)
6852 {
6853     struct regexp *const rx = ReANY(r);
6854     I32 i;
6855     I32 s1, t1;
6856
6857     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6858
6859     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
6860         || paren == RX_BUFF_IDX_CARET_FULLMATCH
6861         || paren == RX_BUFF_IDX_CARET_POSTMATCH
6862     )
6863     {
6864         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6865         if (!keepcopy) {
6866             /* on something like
6867              *    $r = qr/.../;
6868              *    /$qr/p;
6869              * the KEEPCOPY is set on the PMOP rather than the regex */
6870             if (PL_curpm && r == PM_GETRE(PL_curpm))
6871                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6872         }
6873         if (!keepcopy)
6874             goto warn_undef;
6875     }
6876
6877     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6878     switch (paren) {
6879       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6880       case RX_BUFF_IDX_PREMATCH:       /* $` */
6881         if (rx->offs[0].start != -1) {
6882                         i = rx->offs[0].start;
6883                         if (i > 0) {
6884                                 s1 = 0;
6885                                 t1 = i;
6886                                 goto getlen;
6887                         }
6888             }
6889         return 0;
6890
6891       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6892       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6893             if (rx->offs[0].end != -1) {
6894                         i = rx->sublen - rx->offs[0].end;
6895                         if (i > 0) {
6896                                 s1 = rx->offs[0].end;
6897                                 t1 = rx->sublen;
6898                                 goto getlen;
6899                         }
6900             }
6901         return 0;
6902
6903       default: /* $& / ${^MATCH}, $1, $2, ... */
6904             if (paren <= (I32)rx->nparens &&
6905             (s1 = rx->offs[paren].start) != -1 &&
6906             (t1 = rx->offs[paren].end) != -1)
6907             {
6908             i = t1 - s1;
6909             goto getlen;
6910         } else {
6911           warn_undef:
6912             if (ckWARN(WARN_UNINITIALIZED))
6913                 report_uninit((const SV *)sv);
6914             return 0;
6915         }
6916     }
6917   getlen:
6918     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6919         const char * const s = rx->subbeg - rx->suboffset + s1;
6920         const U8 *ep;
6921         STRLEN el;
6922
6923         i = t1 - s1;
6924         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6925                         i = el;
6926     }
6927     return i;
6928 }
6929
6930 SV*
6931 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6932 {
6933     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6934         PERL_UNUSED_ARG(rx);
6935         if (0)
6936             return NULL;
6937         else
6938             return newSVpvs("Regexp");
6939 }
6940
6941 /* Scans the name of a named buffer from the pattern.
6942  * If flags is REG_RSN_RETURN_NULL returns null.
6943  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6944  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6945  * to the parsed name as looked up in the RExC_paren_names hash.
6946  * If there is an error throws a vFAIL().. type exception.
6947  */
6948
6949 #define REG_RSN_RETURN_NULL    0
6950 #define REG_RSN_RETURN_NAME    1
6951 #define REG_RSN_RETURN_DATA    2
6952
6953 STATIC SV*
6954 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6955 {
6956     char *name_start = RExC_parse;
6957
6958     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6959
6960     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6961          /* skip IDFIRST by using do...while */
6962         if (UTF)
6963             do {
6964                 RExC_parse += UTF8SKIP(RExC_parse);
6965             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6966         else
6967             do {
6968                 RExC_parse++;
6969             } while (isWORDCHAR(*RExC_parse));
6970     } else {
6971         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6972         vFAIL("Group name must start with a non-digit word character");
6973     }
6974     if ( flags ) {
6975         SV* sv_name
6976             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6977                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6978         if ( flags == REG_RSN_RETURN_NAME)
6979             return sv_name;
6980         else if (flags==REG_RSN_RETURN_DATA) {
6981             HE *he_str = NULL;
6982             SV *sv_dat = NULL;
6983             if ( ! sv_name )      /* should not happen*/
6984                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6985             if (RExC_paren_names)
6986                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6987             if ( he_str )
6988                 sv_dat = HeVAL(he_str);
6989             if ( ! sv_dat )
6990                 vFAIL("Reference to nonexistent named group");
6991             return sv_dat;
6992         }
6993         else {
6994             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6995                        (unsigned long) flags);
6996         }
6997         assert(0); /* NOT REACHED */
6998     }
6999     return NULL;
7000 }
7001
7002 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7003     int rem=(int)(RExC_end - RExC_parse);                       \
7004     int cut;                                                    \
7005     int num;                                                    \
7006     int iscut=0;                                                \
7007     if (rem>10) {                                               \
7008         rem=10;                                                 \
7009         iscut=1;                                                \
7010     }                                                           \
7011     cut=10-rem;                                                 \
7012     if (RExC_lastparse!=RExC_parse)                             \
7013         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7014             rem, RExC_parse,                                    \
7015             cut + 4,                                            \
7016             iscut ? "..." : "<"                                 \
7017         );                                                      \
7018     else                                                        \
7019         PerlIO_printf(Perl_debug_log,"%16s","");                \
7020                                                                 \
7021     if (SIZE_ONLY)                                              \
7022        num = RExC_size + 1;                                     \
7023     else                                                        \
7024        num=REG_NODE_NUM(RExC_emit);                             \
7025     if (RExC_lastnum!=num)                                      \
7026        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7027     else                                                        \
7028        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7029     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7030         (int)((depth*2)), "",                                   \
7031         (funcname)                                              \
7032     );                                                          \
7033     RExC_lastnum=num;                                           \
7034     RExC_lastparse=RExC_parse;                                  \
7035 })
7036
7037
7038
7039 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7040     DEBUG_PARSE_MSG((funcname));                            \
7041     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7042 })
7043 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7044     DEBUG_PARSE_MSG((funcname));                            \
7045     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7046 })
7047
7048 /* This section of code defines the inversion list object and its methods.  The
7049  * interfaces are highly subject to change, so as much as possible is static to
7050  * this file.  An inversion list is here implemented as a malloc'd C UV array
7051  * as an SVt_INVLIST scalar.
7052  *
7053  * An inversion list for Unicode is an array of code points, sorted by ordinal
7054  * number.  The zeroth element is the first code point in the list.  The 1th
7055  * element is the first element beyond that not in the list.  In other words,
7056  * the first range is
7057  *  invlist[0]..(invlist[1]-1)
7058  * The other ranges follow.  Thus every element whose index is divisible by two
7059  * marks the beginning of a range that is in the list, and every element not
7060  * divisible by two marks the beginning of a range not in the list.  A single
7061  * element inversion list that contains the single code point N generally
7062  * consists of two elements
7063  *  invlist[0] == N
7064  *  invlist[1] == N+1
7065  * (The exception is when N is the highest representable value on the
7066  * machine, in which case the list containing just it would be a single
7067  * element, itself.  By extension, if the last range in the list extends to
7068  * infinity, then the first element of that range will be in the inversion list
7069  * at a position that is divisible by two, and is the final element in the
7070  * list.)
7071  * Taking the complement (inverting) an inversion list is quite simple, if the
7072  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7073  * This implementation reserves an element at the beginning of each inversion
7074  * list to always contain 0; there is an additional flag in the header which
7075  * indicates if the list begins at the 0, or is offset to begin at the next
7076  * element.
7077  *
7078  * More about inversion lists can be found in "Unicode Demystified"
7079  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7080  * More will be coming when functionality is added later.
7081  *
7082  * The inversion list data structure is currently implemented as an SV pointing
7083  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7084  * array of UV whose memory management is automatically handled by the existing
7085  * facilities for SV's.
7086  *
7087  * Some of the methods should always be private to the implementation, and some
7088  * should eventually be made public */
7089
7090 /* The header definitions are in F<inline_invlist.c> */
7091
7092 PERL_STATIC_INLINE UV*
7093 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7094 {
7095     /* Returns a pointer to the first element in the inversion list's array.
7096      * This is called upon initialization of an inversion list.  Where the
7097      * array begins depends on whether the list has the code point U+0000 in it
7098      * or not.  The other parameter tells it whether the code that follows this
7099      * call is about to put a 0 in the inversion list or not.  The first
7100      * element is either the element reserved for 0, if TRUE, or the element
7101      * after it, if FALSE */
7102
7103     bool* offset = get_invlist_offset_addr(invlist);
7104     UV* zero_addr = (UV *) SvPVX(invlist);
7105
7106     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7107
7108     /* Must be empty */
7109     assert(! _invlist_len(invlist));
7110
7111     *zero_addr = 0;
7112
7113     /* 1^1 = 0; 1^0 = 1 */
7114     *offset = 1 ^ will_have_0;
7115     return zero_addr + *offset;
7116 }
7117
7118 PERL_STATIC_INLINE UV*
7119 S_invlist_array(pTHX_ SV* const invlist)
7120 {
7121     /* Returns the pointer to the inversion list's array.  Every time the
7122      * length changes, this needs to be called in case malloc or realloc moved
7123      * it */
7124
7125     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7126
7127     /* Must not be empty.  If these fail, you probably didn't check for <len>
7128      * being non-zero before trying to get the array */
7129     assert(_invlist_len(invlist));
7130
7131     /* The very first element always contains zero, The array begins either
7132      * there, or if the inversion list is offset, at the element after it.
7133      * The offset header field determines which; it contains 0 or 1 to indicate
7134      * how much additionally to add */
7135     assert(0 == *(SvPVX(invlist)));
7136     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7137 }
7138
7139 PERL_STATIC_INLINE void
7140 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7141 {
7142     /* Sets the current number of elements stored in the inversion list.
7143      * Updates SvCUR correspondingly */
7144
7145     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7146
7147     assert(SvTYPE(invlist) == SVt_INVLIST);
7148
7149     SvCUR_set(invlist,
7150               (len == 0)
7151                ? 0
7152                : TO_INTERNAL_SIZE(len + offset));
7153     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7154 }
7155
7156 PERL_STATIC_INLINE IV*
7157 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7158 {
7159     /* Return the address of the IV that is reserved to hold the cached index
7160      * */
7161
7162     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7163
7164     assert(SvTYPE(invlist) == SVt_INVLIST);
7165
7166     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7167 }
7168
7169 PERL_STATIC_INLINE IV
7170 S_invlist_previous_index(pTHX_ SV* const invlist)
7171 {
7172     /* Returns cached index of previous search */
7173
7174     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7175
7176     return *get_invlist_previous_index_addr(invlist);
7177 }
7178
7179 PERL_STATIC_INLINE void
7180 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7181 {
7182     /* Caches <index> for later retrieval */
7183
7184     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7185
7186     assert(index == 0 || index < (int) _invlist_len(invlist));
7187
7188     *get_invlist_previous_index_addr(invlist) = index;
7189 }
7190
7191 PERL_STATIC_INLINE UV
7192 S_invlist_max(pTHX_ SV* const invlist)
7193 {
7194     /* Returns the maximum number of elements storable in the inversion list's
7195      * array, without having to realloc() */
7196
7197     PERL_ARGS_ASSERT_INVLIST_MAX;
7198
7199     assert(SvTYPE(invlist) == SVt_INVLIST);
7200
7201     /* Assumes worst case, in which the 0 element is not counted in the
7202      * inversion list, so subtracts 1 for that */
7203     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7204            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7205            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7206 }
7207
7208 #ifndef PERL_IN_XSUB_RE
7209 SV*
7210 Perl__new_invlist(pTHX_ IV initial_size)
7211 {
7212
7213     /* Return a pointer to a newly constructed inversion list, with enough
7214      * space to store 'initial_size' elements.  If that number is negative, a
7215      * system default is used instead */
7216
7217     SV* new_list;
7218
7219     if (initial_size < 0) {
7220         initial_size = 10;
7221     }
7222
7223     /* Allocate the initial space */
7224     new_list = newSV_type(SVt_INVLIST);
7225
7226     /* First 1 is in case the zero element isn't in the list; second 1 is for
7227      * trailing NUL */
7228     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7229     invlist_set_len(new_list, 0, 0);
7230
7231     /* Force iterinit() to be used to get iteration to work */
7232     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7233
7234     *get_invlist_previous_index_addr(new_list) = 0;
7235
7236     return new_list;
7237 }
7238 #endif
7239
7240 STATIC SV*
7241 S__new_invlist_C_array(pTHX_ const UV* const list)
7242 {
7243     /* Return a pointer to a newly constructed inversion list, initialized to
7244      * point to <list>, which has to be in the exact correct inversion list
7245      * form, including internal fields.  Thus this is a dangerous routine that
7246      * should not be used in the wrong hands.  The passed in 'list' contains
7247      * several header fields at the beginning that are not part of the
7248      * inversion list body proper */
7249
7250     const STRLEN length = (STRLEN) list[0];
7251     const UV version_id =          list[1];
7252     const bool offset   =    cBOOL(list[2]);
7253 #define HEADER_LENGTH 3
7254     /* If any of the above changes in any way, you must change HEADER_LENGTH
7255      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7256      *      perl -E 'say int(rand 2**31-1)'
7257      */
7258 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7259                                         data structure type, so that one being
7260                                         passed in can be validated to be an
7261                                         inversion list of the correct vintage.
7262                                        */
7263
7264     SV* invlist = newSV_type(SVt_INVLIST);
7265
7266     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7267
7268     if (version_id != INVLIST_VERSION_ID) {
7269         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7270     }
7271
7272     /* The generated array passed in includes header elements that aren't part
7273      * of the list proper, so start it just after them */
7274     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7275
7276     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7277                                shouldn't touch it */
7278
7279     *(get_invlist_offset_addr(invlist)) = offset;
7280
7281     /* The 'length' passed to us is the physical number of elements in the
7282      * inversion list.  But if there is an offset the logical number is one
7283      * less than that */
7284     invlist_set_len(invlist, length  - offset, offset);
7285
7286     invlist_set_previous_index(invlist, 0);
7287
7288     /* Initialize the iteration pointer. */
7289     invlist_iterfinish(invlist);
7290
7291     return invlist;
7292 }
7293
7294 STATIC void
7295 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7296 {
7297     /* Grow the maximum size of an inversion list */
7298
7299     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7300
7301     assert(SvTYPE(invlist) == SVt_INVLIST);
7302
7303     /* Add one to account for the zero element at the beginning which may not
7304      * be counted by the calling parameters */
7305     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7306 }
7307
7308 PERL_STATIC_INLINE void
7309 S_invlist_trim(pTHX_ SV* const invlist)
7310 {
7311     PERL_ARGS_ASSERT_INVLIST_TRIM;
7312
7313     assert(SvTYPE(invlist) == SVt_INVLIST);
7314
7315     /* Change the length of the inversion list to how many entries it currently
7316      * has */
7317     SvPV_shrink_to_cur((SV *) invlist);
7318 }
7319
7320 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7321
7322 STATIC void
7323 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7324 {
7325    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7326     * the end of the inversion list.  The range must be above any existing
7327     * ones. */
7328
7329     UV* array;
7330     UV max = invlist_max(invlist);
7331     UV len = _invlist_len(invlist);
7332     bool offset;
7333
7334     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7335
7336     if (len == 0) { /* Empty lists must be initialized */
7337         offset = start != 0;
7338         array = _invlist_array_init(invlist, ! offset);
7339     }
7340     else {
7341         /* Here, the existing list is non-empty. The current max entry in the
7342          * list is generally the first value not in the set, except when the
7343          * set extends to the end of permissible values, in which case it is
7344          * the first entry in that final set, and so this call is an attempt to
7345          * append out-of-order */
7346
7347         UV final_element = len - 1;
7348         array = invlist_array(invlist);
7349         if (array[final_element] > start
7350             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7351         {
7352             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7353                        array[final_element], start,
7354                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7355         }
7356
7357         /* Here, it is a legal append.  If the new range begins with the first
7358          * value not in the set, it is extending the set, so the new first
7359          * value not in the set is one greater than the newly extended range.
7360          * */
7361         offset = *get_invlist_offset_addr(invlist);
7362         if (array[final_element] == start) {
7363             if (end != UV_MAX) {
7364                 array[final_element] = end + 1;
7365             }
7366             else {
7367                 /* But if the end is the maximum representable on the machine,
7368                  * just let the range that this would extend to have no end */
7369                 invlist_set_len(invlist, len - 1, offset);
7370             }
7371             return;
7372         }
7373     }
7374
7375     /* Here the new range doesn't extend any existing set.  Add it */
7376
7377     len += 2;   /* Includes an element each for the start and end of range */
7378
7379     /* If wll overflow the existing space, extend, which may cause the array to
7380      * be moved */
7381     if (max < len) {
7382         invlist_extend(invlist, len);
7383
7384         /* Have to set len here to avoid assert failure in invlist_array() */
7385         invlist_set_len(invlist, len, offset);
7386
7387         array = invlist_array(invlist);
7388     }
7389     else {
7390         invlist_set_len(invlist, len, offset);
7391     }
7392
7393     /* The next item on the list starts the range, the one after that is
7394      * one past the new range.  */
7395     array[len - 2] = start;
7396     if (end != UV_MAX) {
7397         array[len - 1] = end + 1;
7398     }
7399     else {
7400         /* But if the end is the maximum representable on the machine, just let
7401          * the range have no end */
7402         invlist_set_len(invlist, len - 1, offset);
7403     }
7404 }
7405
7406 #ifndef PERL_IN_XSUB_RE
7407
7408 IV
7409 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7410 {
7411     /* Searches the inversion list for the entry that contains the input code
7412      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7413      * return value is the index into the list's array of the range that
7414      * contains <cp> */
7415
7416     IV low = 0;
7417     IV mid;
7418     IV high = _invlist_len(invlist);
7419     const IV highest_element = high - 1;
7420     const UV* array;
7421
7422     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7423
7424     /* If list is empty, return failure. */
7425     if (high == 0) {
7426         return -1;
7427     }
7428
7429     /* (We can't get the array unless we know the list is non-empty) */
7430     array = invlist_array(invlist);
7431
7432     mid = invlist_previous_index(invlist);
7433     assert(mid >=0 && mid <= highest_element);
7434
7435     /* <mid> contains the cache of the result of the previous call to this
7436      * function (0 the first time).  See if this call is for the same result,
7437      * or if it is for mid-1.  This is under the theory that calls to this
7438      * function will often be for related code points that are near each other.
7439      * And benchmarks show that caching gives better results.  We also test
7440      * here if the code point is within the bounds of the list.  These tests
7441      * replace others that would have had to be made anyway to make sure that
7442      * the array bounds were not exceeded, and these give us extra information
7443      * at the same time */
7444     if (cp >= array[mid]) {
7445         if (cp >= array[highest_element]) {
7446             return highest_element;
7447         }
7448
7449         /* Here, array[mid] <= cp < array[highest_element].  This means that
7450          * the final element is not the answer, so can exclude it; it also
7451          * means that <mid> is not the final element, so can refer to 'mid + 1'
7452          * safely */
7453         if (cp < array[mid + 1]) {
7454             return mid;
7455         }
7456         high--;
7457         low = mid + 1;
7458     }
7459     else { /* cp < aray[mid] */
7460         if (cp < array[0]) { /* Fail if outside the array */
7461             return -1;
7462         }
7463         high = mid;
7464         if (cp >= array[mid - 1]) {
7465             goto found_entry;
7466         }
7467     }
7468
7469     /* Binary search.  What we are looking for is <i> such that
7470      *  array[i] <= cp < array[i+1]
7471      * The loop below converges on the i+1.  Note that there may not be an
7472      * (i+1)th element in the array, and things work nonetheless */
7473     while (low < high) {
7474         mid = (low + high) / 2;
7475         assert(mid <= highest_element);
7476         if (array[mid] <= cp) { /* cp >= array[mid] */
7477             low = mid + 1;
7478
7479             /* We could do this extra test to exit the loop early.
7480             if (cp < array[low]) {
7481                 return mid;
7482             }
7483             */
7484         }
7485         else { /* cp < array[mid] */
7486             high = mid;
7487         }
7488     }
7489
7490   found_entry:
7491     high--;
7492     invlist_set_previous_index(invlist, high);
7493     return high;
7494 }
7495
7496 void
7497 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7498 {
7499     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7500      * but is used when the swash has an inversion list.  This makes this much
7501      * faster, as it uses a binary search instead of a linear one.  This is
7502      * intimately tied to that function, and perhaps should be in utf8.c,
7503      * except it is intimately tied to inversion lists as well.  It assumes
7504      * that <swatch> is all 0's on input */
7505
7506     UV current = start;
7507     const IV len = _invlist_len(invlist);
7508     IV i;
7509     const UV * array;
7510
7511     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7512
7513     if (len == 0) { /* Empty inversion list */
7514         return;
7515     }
7516
7517     array = invlist_array(invlist);
7518
7519     /* Find which element it is */
7520     i = _invlist_search(invlist, start);
7521
7522     /* We populate from <start> to <end> */
7523     while (current < end) {
7524         UV upper;
7525
7526         /* The inversion list gives the results for every possible code point
7527          * after the first one in the list.  Only those ranges whose index is
7528          * even are ones that the inversion list matches.  For the odd ones,
7529          * and if the initial code point is not in the list, we have to skip
7530          * forward to the next element */
7531         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7532             i++;
7533             if (i >= len) { /* Finished if beyond the end of the array */
7534                 return;
7535             }
7536             current = array[i];
7537             if (current >= end) {   /* Finished if beyond the end of what we
7538                                        are populating */
7539                 if (LIKELY(end < UV_MAX)) {
7540                     return;
7541                 }
7542
7543                 /* We get here when the upper bound is the maximum
7544                  * representable on the machine, and we are looking for just
7545                  * that code point.  Have to special case it */
7546                 i = len;
7547                 goto join_end_of_list;
7548             }
7549         }
7550         assert(current >= start);
7551
7552         /* The current range ends one below the next one, except don't go past
7553          * <end> */
7554         i++;
7555         upper = (i < len && array[i] < end) ? array[i] : end;
7556
7557         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7558          * for each code point in it */
7559         for (; current < upper; current++) {
7560             const STRLEN offset = (STRLEN)(current - start);
7561             swatch[offset >> 3] |= 1 << (offset & 7);
7562         }
7563
7564     join_end_of_list:
7565
7566         /* Quit if at the end of the list */
7567         if (i >= len) {
7568
7569             /* But first, have to deal with the highest possible code point on
7570              * the platform.  The previous code assumes that <end> is one
7571              * beyond where we want to populate, but that is impossible at the
7572              * platform's infinity, so have to handle it specially */
7573             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7574             {
7575                 const STRLEN offset = (STRLEN)(end - start);
7576                 swatch[offset >> 3] |= 1 << (offset & 7);
7577             }
7578             return;
7579         }
7580
7581         /* Advance to the next range, which will be for code points not in the
7582          * inversion list */
7583         current = array[i];
7584     }
7585
7586     return;
7587 }
7588
7589 void
7590 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7591 {
7592     /* Take the union of two inversion lists and point <output> to it.  *output
7593      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7594      * the reference count to that list will be decremented.  The first list,
7595      * <a>, may be NULL, in which case a copy of the second list is returned.
7596      * If <complement_b> is TRUE, the union is taken of the complement
7597      * (inversion) of <b> instead of b itself.
7598      *
7599      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7600      * Richard Gillam, published by Addison-Wesley, and explained at some
7601      * length there.  The preface says to incorporate its examples into your
7602      * code at your own risk.
7603      *
7604      * The algorithm is like a merge sort.
7605      *
7606      * XXX A potential performance improvement is to keep track as we go along
7607      * if only one of the inputs contributes to the result, meaning the other
7608      * is a subset of that one.  In that case, we can skip the final copy and
7609      * return the larger of the input lists, but then outside code might need
7610      * to keep track of whether to free the input list or not */
7611
7612     const UV* array_a;    /* a's array */
7613     const UV* array_b;
7614     UV len_a;       /* length of a's array */
7615     UV len_b;
7616
7617     SV* u;                      /* the resulting union */
7618     UV* array_u;
7619     UV len_u;
7620
7621     UV i_a = 0;             /* current index into a's array */
7622     UV i_b = 0;
7623     UV i_u = 0;
7624
7625     /* running count, as explained in the algorithm source book; items are
7626      * stopped accumulating and are output when the count changes to/from 0.
7627      * The count is incremented when we start a range that's in the set, and
7628      * decremented when we start a range that's not in the set.  So its range
7629      * is 0 to 2.  Only when the count is zero is something not in the set.
7630      */
7631     UV count = 0;
7632
7633     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7634     assert(a != b);
7635
7636     /* If either one is empty, the union is the other one */
7637     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7638         if (*output == a) {
7639             if (a != NULL) {
7640                 SvREFCNT_dec_NN(a);
7641             }
7642         }
7643         if (*output != b) {
7644             *output = invlist_clone(b);
7645             if (complement_b) {
7646                 _invlist_invert(*output);
7647             }
7648         } /* else *output already = b; */
7649         return;
7650     }
7651     else if ((len_b = _invlist_len(b)) == 0) {
7652         if (*output == b) {
7653             SvREFCNT_dec_NN(b);
7654         }
7655
7656         /* The complement of an empty list is a list that has everything in it,
7657          * so the union with <a> includes everything too */
7658         if (complement_b) {
7659             if (a == *output) {
7660                 SvREFCNT_dec_NN(a);
7661             }
7662             *output = _new_invlist(1);
7663             _append_range_to_invlist(*output, 0, UV_MAX);
7664         }
7665         else if (*output != a) {
7666             *output = invlist_clone(a);
7667         }
7668         /* else *output already = a; */
7669         return;
7670     }
7671
7672     /* Here both lists exist and are non-empty */
7673     array_a = invlist_array(a);
7674     array_b = invlist_array(b);
7675
7676     /* If are to take the union of 'a' with the complement of b, set it
7677      * up so are looking at b's complement. */
7678     if (complement_b) {
7679
7680         /* To complement, we invert: if the first element is 0, remove it.  To
7681          * do this, we just pretend the array starts one later */
7682         if (array_b[0] == 0) {
7683             array_b++;
7684             len_b--;
7685         }
7686         else {
7687
7688             /* But if the first element is not zero, we pretend the list starts
7689              * at the 0 that is always stored immediately before the array. */
7690             array_b--;
7691             len_b++;
7692         }
7693     }
7694
7695     /* Size the union for the worst case: that the sets are completely
7696      * disjoint */
7697     u = _new_invlist(len_a + len_b);
7698
7699     /* Will contain U+0000 if either component does */
7700     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7701                                       || (len_b > 0 && array_b[0] == 0));
7702
7703     /* Go through each list item by item, stopping when exhausted one of
7704      * them */
7705     while (i_a < len_a && i_b < len_b) {
7706         UV cp;      /* The element to potentially add to the union's array */
7707         bool cp_in_set;   /* is it in the the input list's set or not */
7708
7709         /* We need to take one or the other of the two inputs for the union.
7710          * Since we are merging two sorted lists, we take the smaller of the
7711          * next items.  In case of a tie, we take the one that is in its set
7712          * first.  If we took one not in the set first, it would decrement the
7713          * count, possibly to 0 which would cause it to be output as ending the
7714          * range, and the next time through we would take the same number, and
7715          * output it again as beginning the next range.  By doing it the
7716          * opposite way, there is no possibility that the count will be
7717          * momentarily decremented to 0, and thus the two adjoining ranges will
7718          * be seamlessly merged.  (In a tie and both are in the set or both not
7719          * in the set, it doesn't matter which we take first.) */
7720         if (array_a[i_a] < array_b[i_b]
7721             || (array_a[i_a] == array_b[i_b]
7722                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7723         {
7724             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7725             cp= array_a[i_a++];
7726         }
7727         else {
7728             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7729             cp = array_b[i_b++];
7730         }
7731
7732         /* Here, have chosen which of the two inputs to look at.  Only output
7733          * if the running count changes to/from 0, which marks the
7734          * beginning/end of a range in that's in the set */
7735         if (cp_in_set) {
7736             if (count == 0) {
7737                 array_u[i_u++] = cp;
7738             }
7739             count++;
7740         }
7741         else {
7742             count--;
7743             if (count == 0) {
7744                 array_u[i_u++] = cp;
7745             }
7746         }
7747     }
7748
7749     /* Here, we are finished going through at least one of the lists, which
7750      * means there is something remaining in at most one.  We check if the list
7751      * that hasn't been exhausted is positioned such that we are in the middle
7752      * of a range in its set or not.  (i_a and i_b point to the element beyond
7753      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7754      * is potentially more to output.
7755      * There are four cases:
7756      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7757      *     in the union is entirely from the non-exhausted set.
7758      *  2) Both were in their sets, count is 2.  Nothing further should
7759      *     be output, as everything that remains will be in the exhausted
7760      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7761      *     that
7762      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7763      *     Nothing further should be output because the union includes
7764      *     everything from the exhausted set.  Not decrementing ensures that.
7765      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7766      *     decrementing to 0 insures that we look at the remainder of the
7767      *     non-exhausted set */
7768     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7769         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7770     {
7771         count--;
7772     }
7773
7774     /* The final length is what we've output so far, plus what else is about to
7775      * be output.  (If 'count' is non-zero, then the input list we exhausted
7776      * has everything remaining up to the machine's limit in its set, and hence
7777      * in the union, so there will be no further output. */
7778     len_u = i_u;
7779     if (count == 0) {
7780         /* At most one of the subexpressions will be non-zero */
7781         len_u += (len_a - i_a) + (len_b - i_b);
7782     }
7783
7784     /* Set result to final length, which can change the pointer to array_u, so
7785      * re-find it */
7786     if (len_u != _invlist_len(u)) {
7787         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7788         invlist_trim(u);
7789         array_u = invlist_array(u);
7790     }
7791
7792     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7793      * the other) ended with everything above it not in its set.  That means
7794      * that the remaining part of the union is precisely the same as the
7795      * non-exhausted list, so can just copy it unchanged.  (If both list were
7796      * exhausted at the same time, then the operations below will be both 0.)
7797      */
7798     if (count == 0) {
7799         IV copy_count; /* At most one will have a non-zero copy count */
7800         if ((copy_count = len_a - i_a) > 0) {
7801             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7802         }
7803         else if ((copy_count = len_b - i_b) > 0) {
7804             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7805         }
7806     }
7807
7808     /*  We may be removing a reference to one of the inputs */
7809     if (a == *output || b == *output) {
7810         assert(! invlist_is_iterating(*output));
7811         SvREFCNT_dec_NN(*output);
7812     }
7813
7814     *output = u;
7815     return;
7816 }
7817
7818 void
7819 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7820 {
7821     /* Take the intersection of two inversion lists and point <i> to it.  *i
7822      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7823      * the reference count to that list will be decremented.
7824      * If <complement_b> is TRUE, the result will be the intersection of <a>
7825      * and the complement (or inversion) of <b> instead of <b> directly.
7826      *
7827      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7828      * Richard Gillam, published by Addison-Wesley, and explained at some
7829      * length there.  The preface says to incorporate its examples into your
7830      * code at your own risk.  In fact, it had bugs
7831      *
7832      * The algorithm is like a merge sort, and is essentially the same as the
7833      * union above
7834      */
7835
7836     const UV* array_a;          /* a's array */
7837     const UV* array_b;
7838     UV len_a;   /* length of a's array */
7839     UV len_b;
7840
7841     SV* r;                   /* the resulting intersection */
7842     UV* array_r;
7843     UV len_r;
7844
7845     UV i_a = 0;             /* current index into a's array */
7846     UV i_b = 0;
7847     UV i_r = 0;
7848
7849     /* running count, as explained in the algorithm source book; items are
7850      * stopped accumulating and are output when the count changes to/from 2.
7851      * The count is incremented when we start a range that's in the set, and
7852      * decremented when we start a range that's not in the set.  So its range
7853      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7854      */
7855     UV count = 0;
7856
7857     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7858     assert(a != b);
7859
7860     /* Special case if either one is empty */
7861     len_a = (a == NULL) ? 0 : _invlist_len(a);
7862     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7863
7864         if (len_a != 0 && complement_b) {
7865
7866             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7867              * be empty.  Here, also we are using 'b's complement, which hence
7868              * must be every possible code point.  Thus the intersection is
7869              * simply 'a'. */
7870             if (*i != a) {
7871                 if (*i == b) {
7872                     SvREFCNT_dec_NN(b);
7873                 }
7874
7875                 *i = invlist_clone(a);
7876             }
7877             /* else *i is already 'a' */
7878             return;
7879         }
7880
7881         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7882          * intersection must be empty */
7883         if (*i == a) {
7884             SvREFCNT_dec_NN(a);
7885         }
7886         else if (*i == b) {
7887             SvREFCNT_dec_NN(b);
7888         }
7889         *i = _new_invlist(0);
7890         return;
7891     }
7892
7893     /* Here both lists exist and are non-empty */
7894     array_a = invlist_array(a);
7895     array_b = invlist_array(b);
7896
7897     /* If are to take the intersection of 'a' with the complement of b, set it
7898      * up so are looking at b's complement. */
7899     if (complement_b) {
7900
7901         /* To complement, we invert: if the first element is 0, remove it.  To
7902          * do this, we just pretend the array starts one later */
7903         if (array_b[0] == 0) {
7904             array_b++;
7905             len_b--;
7906         }
7907         else {
7908
7909             /* But if the first element is not zero, we pretend the list starts
7910              * at the 0 that is always stored immediately before the array. */
7911             array_b--;
7912             len_b++;
7913         }
7914     }
7915
7916     /* Size the intersection for the worst case: that the intersection ends up
7917      * fragmenting everything to be completely disjoint */
7918     r= _new_invlist(len_a + len_b);
7919
7920     /* Will contain U+0000 iff both components do */
7921     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7922                                      && len_b > 0 && array_b[0] == 0);
7923
7924     /* Go through each list item by item, stopping when exhausted one of
7925      * them */
7926     while (i_a < len_a && i_b < len_b) {
7927         UV cp;      /* The element to potentially add to the intersection's
7928                        array */
7929         bool cp_in_set; /* Is it in the input list's set or not */
7930
7931         /* We need to take one or the other of the two inputs for the
7932          * intersection.  Since we are merging two sorted lists, we take the
7933          * smaller of the next items.  In case of a tie, we take the one that
7934          * is not in its set first (a difference from the union algorithm).  If
7935          * we took one in the set first, it would increment the count, possibly
7936          * to 2 which would cause it to be output as starting a range in the
7937          * intersection, and the next time through we would take that same
7938          * number, and output it again as ending the set.  By doing it the
7939          * opposite of this, there is no possibility that the count will be
7940          * momentarily incremented to 2.  (In a tie and both are in the set or
7941          * both not in the set, it doesn't matter which we take first.) */
7942         if (array_a[i_a] < array_b[i_b]
7943             || (array_a[i_a] == array_b[i_b]
7944                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7945         {
7946             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7947             cp= array_a[i_a++];
7948         }
7949         else {
7950             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7951             cp= array_b[i_b++];
7952         }
7953
7954         /* Here, have chosen which of the two inputs to look at.  Only output
7955          * if the running count changes to/from 2, which marks the
7956          * beginning/end of a range that's in the intersection */
7957         if (cp_in_set) {
7958             count++;
7959             if (count == 2) {
7960                 array_r[i_r++] = cp;
7961             }
7962         }
7963         else {
7964             if (count == 2) {
7965                 array_r[i_r++] = cp;
7966             }
7967             count--;
7968         }
7969     }
7970
7971     /* Here, we are finished going through at least one of the lists, which
7972      * means there is something remaining in at most one.  We check if the list
7973      * that has been exhausted is positioned such that we are in the middle
7974      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7975      * the ones we care about.)  There are four cases:
7976      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7977      *     nothing left in the intersection.
7978      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7979      *     above 2.  What should be output is exactly that which is in the
7980      *     non-exhausted set, as everything it has is also in the intersection
7981      *     set, and everything it doesn't have can't be in the intersection
7982      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7983      *     gets incremented to 2.  Like the previous case, the intersection is
7984      *     everything that remains in the non-exhausted set.
7985      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7986      *     remains 1.  And the intersection has nothing more. */
7987     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7988         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7989     {
7990         count++;
7991     }
7992
7993     /* The final length is what we've output so far plus what else is in the
7994      * intersection.  At most one of the subexpressions below will be non-zero */
7995     len_r = i_r;
7996     if (count >= 2) {
7997         len_r += (len_a - i_a) + (len_b - i_b);
7998     }
7999
8000     /* Set result to final length, which can change the pointer to array_r, so
8001      * re-find it */
8002     if (len_r != _invlist_len(r)) {
8003         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8004         invlist_trim(r);
8005         array_r = invlist_array(r);
8006     }
8007
8008     /* Finish outputting any remaining */
8009     if (count >= 2) { /* At most one will have a non-zero copy count */
8010         IV copy_count;
8011         if ((copy_count = len_a - i_a) > 0) {
8012             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8013         }
8014         else if ((copy_count = len_b - i_b) > 0) {
8015             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8016         }
8017     }
8018
8019     /*  We may be removing a reference to one of the inputs */
8020     if (a == *i || b == *i) {
8021         assert(! invlist_is_iterating(*i));
8022         SvREFCNT_dec_NN(*i);
8023     }
8024
8025     *i = r;
8026     return;
8027 }
8028
8029 SV*
8030 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8031 {
8032     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8033      * set.  A pointer to the inversion list is returned.  This may actually be
8034      * a new list, in which case the passed in one has been destroyed.  The
8035      * passed in inversion list can be NULL, in which case a new one is created
8036      * with just the one range in it */
8037
8038     SV* range_invlist;
8039     UV len;
8040
8041     if (invlist == NULL) {
8042         invlist = _new_invlist(2);
8043         len = 0;
8044     }
8045     else {
8046         len = _invlist_len(invlist);
8047     }
8048
8049     /* If comes after the final entry actually in the list, can just append it
8050      * to the end, */
8051     if (len == 0
8052         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8053             && start >= invlist_array(invlist)[len - 1]))
8054     {
8055         _append_range_to_invlist(invlist, start, end);
8056         return invlist;
8057     }
8058
8059     /* Here, can't just append things, create and return a new inversion list
8060      * which is the union of this range and the existing inversion list */
8061     range_invlist = _new_invlist(2);
8062     _append_range_to_invlist(range_invlist, start, end);
8063
8064     _invlist_union(invlist, range_invlist, &invlist);
8065
8066     /* The temporary can be freed */
8067     SvREFCNT_dec_NN(range_invlist);
8068
8069     return invlist;
8070 }
8071
8072 #endif
8073
8074 PERL_STATIC_INLINE SV*
8075 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8076     return _add_range_to_invlist(invlist, cp, cp);
8077 }
8078
8079 #ifndef PERL_IN_XSUB_RE
8080 void
8081 Perl__invlist_invert(pTHX_ SV* const invlist)
8082 {
8083     /* Complement the input inversion list.  This adds a 0 if the list didn't
8084      * have a zero; removes it otherwise.  As described above, the data
8085      * structure is set up so that this is very efficient */
8086
8087     PERL_ARGS_ASSERT__INVLIST_INVERT;
8088
8089     assert(! invlist_is_iterating(invlist));
8090
8091     /* The inverse of matching nothing is matching everything */
8092     if (_invlist_len(invlist) == 0) {
8093         _append_range_to_invlist(invlist, 0, UV_MAX);
8094         return;
8095     }
8096
8097     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8098 }
8099
8100 void
8101 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8102 {
8103     /* Complement the input inversion list (which must be a Unicode property,
8104      * all of which don't match above the Unicode maximum code point.)  And
8105      * Perl has chosen to not have the inversion match above that either.  This
8106      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8107      */
8108
8109     UV len;
8110     UV* array;
8111
8112     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8113
8114     _invlist_invert(invlist);
8115
8116     len = _invlist_len(invlist);
8117
8118     if (len != 0) { /* If empty do nothing */
8119         array = invlist_array(invlist);
8120         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8121             /* Add 0x110000.  First, grow if necessary */
8122             len++;
8123             if (invlist_max(invlist) < len) {
8124                 invlist_extend(invlist, len);
8125                 array = invlist_array(invlist);
8126             }
8127             invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8128             array[len - 1] = PERL_UNICODE_MAX + 1;
8129         }
8130         else {  /* Remove the 0x110000 */
8131             invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8132         }
8133     }
8134
8135     return;
8136 }
8137 #endif
8138
8139 PERL_STATIC_INLINE SV*
8140 S_invlist_clone(pTHX_ SV* const invlist)
8141 {
8142
8143     /* Return a new inversion list that is a copy of the input one, which is
8144      * unchanged */
8145
8146     /* Need to allocate extra space to accommodate Perl's addition of a
8147      * trailing NUL to SvPV's, since it thinks they are always strings */
8148     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8149     STRLEN physical_length = SvCUR(invlist);
8150     bool offset = *(get_invlist_offset_addr(invlist));
8151
8152     PERL_ARGS_ASSERT_INVLIST_CLONE;
8153
8154     *(get_invlist_offset_addr(new_invlist)) = offset;
8155     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8156     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8157
8158     return new_invlist;
8159 }
8160
8161 PERL_STATIC_INLINE STRLEN*
8162 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8163 {
8164     /* Return the address of the UV that contains the current iteration
8165      * position */
8166
8167     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8168
8169     assert(SvTYPE(invlist) == SVt_INVLIST);
8170
8171     return &(((XINVLIST*) SvANY(invlist))->iterator);
8172 }
8173
8174 PERL_STATIC_INLINE void
8175 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8176 {
8177     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8178
8179     *get_invlist_iter_addr(invlist) = 0;
8180 }
8181
8182 PERL_STATIC_INLINE void
8183 S_invlist_iterfinish(pTHX_ SV* invlist)
8184 {
8185     /* Terminate iterator for invlist.  This is to catch development errors.
8186      * Any iteration that is interrupted before completed should call this
8187      * function.  Functions that add code points anywhere else but to the end
8188      * of an inversion list assert that they are not in the middle of an
8189      * iteration.  If they were, the addition would make the iteration
8190      * problematical: if the iteration hadn't reached the place where things
8191      * were being added, it would be ok */
8192
8193     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8194
8195     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8196 }
8197
8198 STATIC bool
8199 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8200 {
8201     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8202      * This call sets in <*start> and <*end>, the next range in <invlist>.
8203      * Returns <TRUE> if successful and the next call will return the next
8204      * range; <FALSE> if was already at the end of the list.  If the latter,
8205      * <*start> and <*end> are unchanged, and the next call to this function
8206      * will start over at the beginning of the list */
8207
8208     STRLEN* pos = get_invlist_iter_addr(invlist);
8209     UV len = _invlist_len(invlist);
8210     UV *array;
8211
8212     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8213
8214     if (*pos >= len) {
8215         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8216         return FALSE;
8217     }
8218
8219     array = invlist_array(invlist);
8220
8221     *start = array[(*pos)++];
8222
8223     if (*pos >= len) {
8224         *end = UV_MAX;
8225     }
8226     else {
8227         *end = array[(*pos)++] - 1;
8228     }
8229
8230     return TRUE;
8231 }
8232
8233 PERL_STATIC_INLINE bool
8234 S_invlist_is_iterating(pTHX_ SV* const invlist)
8235 {
8236     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8237
8238     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8239 }
8240
8241 PERL_STATIC_INLINE UV
8242 S_invlist_highest(pTHX_ SV* const invlist)
8243 {
8244     /* Returns the highest code point that matches an inversion list.  This API
8245      * has an ambiguity, as it returns 0 under either the highest is actually
8246      * 0, or if the list is empty.  If this distinction matters to you, check
8247      * for emptiness before calling this function */
8248
8249     UV len = _invlist_len(invlist);
8250     UV *array;
8251
8252     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8253
8254     if (len == 0) {
8255         return 0;
8256     }
8257
8258     array = invlist_array(invlist);
8259
8260     /* The last element in the array in the inversion list always starts a
8261      * range that goes to infinity.  That range may be for code points that are
8262      * matched in the inversion list, or it may be for ones that aren't
8263      * matched.  In the latter case, the highest code point in the set is one
8264      * less than the beginning of this range; otherwise it is the final element
8265      * of this range: infinity */
8266     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8267            ? UV_MAX
8268            : array[len - 1] - 1;
8269 }
8270
8271 #ifndef PERL_IN_XSUB_RE
8272 SV *
8273 Perl__invlist_contents(pTHX_ SV* const invlist)
8274 {
8275     /* Get the contents of an inversion list into a string SV so that they can
8276      * be printed out.  It uses the format traditionally done for debug tracing
8277      */
8278
8279     UV start, end;
8280     SV* output = newSVpvs("\n");
8281
8282     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8283
8284     assert(! invlist_is_iterating(invlist));
8285
8286     invlist_iterinit(invlist);
8287     while (invlist_iternext(invlist, &start, &end)) {
8288         if (end == UV_MAX) {
8289             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8290         }
8291         else if (end != start) {
8292             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8293                     start,       end);
8294         }
8295         else {
8296             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8297         }
8298     }
8299
8300     return output;
8301 }
8302 #endif
8303
8304 #ifndef PERL_IN_XSUB_RE
8305 void
8306 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8307 {
8308     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8309      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8310      * the string 'indent'.  The output looks like this:
8311          [0] 0x000A .. 0x000D
8312          [2] 0x0085
8313          [4] 0x2028 .. 0x2029
8314          [6] 0x3104 .. INFINITY
8315      * This means that the first range of code points matched by the list are
8316      * 0xA through 0xD; the second range contains only the single code point
8317      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8318      * are used to define each range (except if the final range extends to
8319      * infinity, only a single element is needed).  The array index of the
8320      * first element for the corresponding range is given in brackets. */
8321
8322     UV start, end;
8323     STRLEN count = 0;
8324
8325     PERL_ARGS_ASSERT__INVLIST_DUMP;
8326
8327     if (invlist_is_iterating(invlist)) {
8328         Perl_dump_indent(aTHX_ level, file,
8329              "%sCan't dump inversion list because is in middle of iterating\n",
8330              indent);
8331         return;
8332     }
8333
8334     invlist_iterinit(invlist);
8335     while (invlist_iternext(invlist, &start, &end)) {
8336         if (end == UV_MAX) {
8337             Perl_dump_indent(aTHX_ level, file,
8338                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8339                                    indent, (UV)count, start);
8340         }
8341         else if (end != start) {
8342             Perl_dump_indent(aTHX_ level, file,
8343                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8344                                 indent, (UV)count, start,         end);
8345         }
8346         else {
8347             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8348                                             indent, (UV)count, start);
8349         }
8350         count += 2;
8351     }
8352 }
8353 #endif
8354
8355 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8356 bool
8357 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8358 {
8359     /* Return a boolean as to if the two passed in inversion lists are
8360      * identical.  The final argument, if TRUE, says to take the complement of
8361      * the second inversion list before doing the comparison */
8362
8363     const UV* array_a = invlist_array(a);
8364     const UV* array_b = invlist_array(b);
8365     UV len_a = _invlist_len(a);
8366     UV len_b = _invlist_len(b);
8367
8368     UV i = 0;               /* current index into the arrays */
8369     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8370
8371     PERL_ARGS_ASSERT__INVLISTEQ;
8372
8373     /* If are to compare 'a' with the complement of b, set it
8374      * up so are looking at b's complement. */
8375     if (complement_b) {
8376
8377         /* The complement of nothing is everything, so <a> would have to have
8378          * just one element, starting at zero (ending at infinity) */
8379         if (len_b == 0) {
8380             return (len_a == 1 && array_a[0] == 0);
8381         }
8382         else if (array_b[0] == 0) {
8383
8384             /* Otherwise, to complement, we invert.  Here, the first element is
8385              * 0, just remove it.  To do this, we just pretend the array starts
8386              * one later */
8387
8388             array_b++;
8389             len_b--;
8390         }
8391         else {
8392
8393             /* But if the first element is not zero, we pretend the list starts
8394              * at the 0 that is always stored immediately before the array. */
8395             array_b--;
8396             len_b++;
8397         }
8398     }
8399
8400     /* Make sure that the lengths are the same, as well as the final element
8401      * before looping through the remainder.  (Thus we test the length, final,
8402      * and first elements right off the bat) */
8403     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8404         retval = FALSE;
8405     }
8406     else for (i = 0; i < len_a - 1; i++) {
8407         if (array_a[i] != array_b[i]) {
8408             retval = FALSE;
8409             break;
8410         }
8411     }
8412
8413     return retval;
8414 }
8415 #endif
8416
8417 #undef HEADER_LENGTH
8418 #undef TO_INTERNAL_SIZE
8419 #undef FROM_INTERNAL_SIZE
8420 #undef INVLIST_VERSION_ID
8421
8422 /* End of inversion list object */
8423
8424 STATIC void
8425 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8426 {
8427     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8428      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8429      * should point to the first flag; it is updated on output to point to the
8430      * final ')' or ':'.  There needs to be at least one flag, or this will
8431      * abort */
8432
8433     /* for (?g), (?gc), and (?o) warnings; warning
8434        about (?c) will warn about (?g) -- japhy    */
8435
8436 #define WASTED_O  0x01
8437 #define WASTED_G  0x02
8438 #define WASTED_C  0x04
8439 #define WASTED_GC (WASTED_G|WASTED_C)
8440     I32 wastedflags = 0x00;
8441     U32 posflags = 0, negflags = 0;
8442     U32 *flagsp = &posflags;
8443     char has_charset_modifier = '\0';
8444     regex_charset cs;
8445     bool has_use_defaults = FALSE;
8446     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8447
8448     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8449
8450     /* '^' as an initial flag sets certain defaults */
8451     if (UCHARAT(RExC_parse) == '^') {
8452         RExC_parse++;
8453         has_use_defaults = TRUE;
8454         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8455         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8456                                         ? REGEX_UNICODE_CHARSET
8457                                         : REGEX_DEPENDS_CHARSET);
8458     }
8459
8460     cs = get_regex_charset(RExC_flags);
8461     if (cs == REGEX_DEPENDS_CHARSET
8462         && (RExC_utf8 || RExC_uni_semantics))
8463     {
8464         cs = REGEX_UNICODE_CHARSET;
8465     }
8466
8467     while (*RExC_parse) {
8468         /* && strchr("iogcmsx", *RExC_parse) */
8469         /* (?g), (?gc) and (?o) are useless here
8470            and must be globally applied -- japhy */
8471         switch (*RExC_parse) {
8472
8473             /* Code for the imsx flags */
8474             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8475
8476             case LOCALE_PAT_MOD:
8477                 if (has_charset_modifier) {
8478                     goto excess_modifier;
8479                 }
8480                 else if (flagsp == &negflags) {
8481                     goto neg_modifier;
8482                 }
8483                 cs = REGEX_LOCALE_CHARSET;
8484                 has_charset_modifier = LOCALE_PAT_MOD;
8485                 RExC_contains_locale = 1;
8486                 break;
8487             case UNICODE_PAT_MOD:
8488                 if (has_charset_modifier) {
8489                     goto excess_modifier;
8490                 }
8491                 else if (flagsp == &negflags) {
8492                     goto neg_modifier;
8493                 }
8494                 cs = REGEX_UNICODE_CHARSET;
8495                 has_charset_modifier = UNICODE_PAT_MOD;
8496                 break;
8497             case ASCII_RESTRICT_PAT_MOD:
8498                 if (flagsp == &negflags) {
8499                     goto neg_modifier;
8500                 }
8501                 if (has_charset_modifier) {
8502                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8503                         goto excess_modifier;
8504                     }
8505                     /* Doubled modifier implies more restricted */
8506                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8507                 }
8508                 else {
8509                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8510                 }
8511                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8512                 break;
8513             case DEPENDS_PAT_MOD:
8514                 if (has_use_defaults) {
8515                     goto fail_modifiers;
8516                 }
8517                 else if (flagsp == &negflags) {
8518                     goto neg_modifier;
8519                 }
8520                 else if (has_charset_modifier) {
8521                     goto excess_modifier;
8522                 }
8523
8524                 /* The dual charset means unicode semantics if the
8525                  * pattern (or target, not known until runtime) are
8526                  * utf8, or something in the pattern indicates unicode
8527                  * semantics */
8528                 cs = (RExC_utf8 || RExC_uni_semantics)
8529                      ? REGEX_UNICODE_CHARSET
8530                      : REGEX_DEPENDS_CHARSET;
8531                 has_charset_modifier = DEPENDS_PAT_MOD;
8532                 break;
8533             excess_modifier:
8534                 RExC_parse++;
8535                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8536                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8537                 }
8538                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8539                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8540                 }
8541                 else {
8542                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8543                 }
8544                 /*NOTREACHED*/
8545             neg_modifier:
8546                 RExC_parse++;
8547                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8548                 /*NOTREACHED*/
8549             case ONCE_PAT_MOD: /* 'o' */
8550             case GLOBAL_PAT_MOD: /* 'g' */
8551                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8552                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8553                     if (! (wastedflags & wflagbit) ) {
8554                         wastedflags |= wflagbit;
8555                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8556                         vWARN5(
8557                             RExC_parse + 1,
8558                             "Useless (%s%c) - %suse /%c modifier",
8559                             flagsp == &negflags ? "?-" : "?",
8560                             *RExC_parse,
8561                             flagsp == &negflags ? "don't " : "",
8562                             *RExC_parse
8563                         );
8564                     }
8565                 }
8566                 break;
8567
8568             case CONTINUE_PAT_MOD: /* 'c' */
8569                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8570                     if (! (wastedflags & WASTED_C) ) {
8571                         wastedflags |= WASTED_GC;
8572                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8573                         vWARN3(
8574                             RExC_parse + 1,
8575                             "Useless (%sc) - %suse /gc modifier",
8576                             flagsp == &negflags ? "?-" : "?",
8577                             flagsp == &negflags ? "don't " : ""
8578                         );
8579                     }
8580                 }
8581                 break;
8582             case KEEPCOPY_PAT_MOD: /* 'p' */
8583                 if (flagsp == &negflags) {
8584                     if (SIZE_ONLY)
8585                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8586                 } else {
8587                     *flagsp |= RXf_PMf_KEEPCOPY;
8588                 }
8589                 break;
8590             case '-':
8591                 /* A flag is a default iff it is following a minus, so
8592                  * if there is a minus, it means will be trying to
8593                  * re-specify a default which is an error */
8594                 if (has_use_defaults || flagsp == &negflags) {
8595                     goto fail_modifiers;
8596                 }
8597                 flagsp = &negflags;
8598                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8599                 break;
8600             case ':':
8601             case ')':
8602                 RExC_flags |= posflags;
8603                 RExC_flags &= ~negflags;
8604                 set_regex_charset(&RExC_flags, cs);
8605                 return;
8606                 /*NOTREACHED*/
8607             default:
8608             fail_modifiers:
8609                 RExC_parse++;
8610                 vFAIL3("Sequence (%.*s...) not recognized",
8611                        RExC_parse-seqstart, seqstart);
8612                 /*NOTREACHED*/
8613         }
8614
8615         ++RExC_parse;
8616     }
8617 }
8618
8619 /*
8620  - reg - regular expression, i.e. main body or parenthesized thing
8621  *
8622  * Caller must absorb opening parenthesis.
8623  *
8624  * Combining parenthesis handling with the base level of regular expression
8625  * is a trifle forced, but the need to tie the tails of the branches to what
8626  * follows makes it hard to avoid.
8627  */
8628 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8629 #ifdef DEBUGGING
8630 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8631 #else
8632 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8633 #endif
8634
8635 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8636    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8637    needs to be restarted.
8638    Otherwise would only return NULL if regbranch() returns NULL, which
8639    cannot happen.  */
8640 STATIC regnode *
8641 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8642     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8643      * 2 is like 1, but indicates that nextchar() has been called to advance
8644      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8645      * this flag alerts us to the need to check for that */
8646 {
8647     dVAR;
8648     regnode *ret;               /* Will be the head of the group. */
8649     regnode *br;
8650     regnode *lastbr;
8651     regnode *ender = NULL;
8652     I32 parno = 0;
8653     I32 flags;
8654     U32 oregflags = RExC_flags;
8655     bool have_branch = 0;
8656     bool is_open = 0;
8657     I32 freeze_paren = 0;
8658     I32 after_freeze = 0;
8659
8660     char * parse_start = RExC_parse; /* MJD */
8661     char * const oregcomp_parse = RExC_parse;
8662
8663     GET_RE_DEBUG_FLAGS_DECL;
8664
8665     PERL_ARGS_ASSERT_REG;
8666     DEBUG_PARSE("reg ");
8667
8668     *flagp = 0;                         /* Tentatively. */
8669
8670
8671     /* Make an OPEN node, if parenthesized. */
8672     if (paren) {
8673
8674         /* Under /x, space and comments can be gobbled up between the '(' and
8675          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8676          * intervening space, as the sequence is a token, and a token should be
8677          * indivisible */
8678         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8679
8680         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8681             char *start_verb = RExC_parse;
8682             STRLEN verb_len = 0;
8683             char *start_arg = NULL;
8684             unsigned char op = 0;
8685             int argok = 1;
8686             int internal_argval = 0; /* internal_argval is only useful if !argok */
8687
8688             if (has_intervening_patws && SIZE_ONLY) {
8689                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8690             }
8691             while ( *RExC_parse && *RExC_parse != ')' ) {
8692                 if ( *RExC_parse == ':' ) {
8693                     start_arg = RExC_parse + 1;
8694                     break;
8695                 }
8696                 RExC_parse++;
8697             }
8698             ++start_verb;
8699             verb_len = RExC_parse - start_verb;
8700             if ( start_arg ) {
8701                 RExC_parse++;
8702                 while ( *RExC_parse && *RExC_parse != ')' )
8703                     RExC_parse++;
8704                 if ( *RExC_parse != ')' )
8705                     vFAIL("Unterminated verb pattern argument");
8706                 if ( RExC_parse == start_arg )
8707                     start_arg = NULL;
8708             } else {
8709                 if ( *RExC_parse != ')' )
8710                     vFAIL("Unterminated verb pattern");
8711             }
8712
8713             switch ( *start_verb ) {
8714             case 'A':  /* (*ACCEPT) */
8715                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8716                     op = ACCEPT;
8717                     internal_argval = RExC_nestroot;
8718                 }
8719                 break;
8720             case 'C':  /* (*COMMIT) */
8721                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8722                     op = COMMIT;
8723                 break;
8724             case 'F':  /* (*FAIL) */
8725                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8726                     op = OPFAIL;
8727                     argok = 0;
8728                 }
8729                 break;
8730             case ':':  /* (*:NAME) */
8731             case 'M':  /* (*MARK:NAME) */
8732                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8733                     op = MARKPOINT;
8734                     argok = -1;
8735                 }
8736                 break;
8737             case 'P':  /* (*PRUNE) */
8738                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8739                     op = PRUNE;
8740                 break;
8741             case 'S':   /* (*SKIP) */
8742                 if ( memEQs(start_verb,verb_len,"SKIP") )
8743                     op = SKIP;
8744                 break;
8745             case 'T':  /* (*THEN) */
8746                 /* [19:06] <TimToady> :: is then */
8747                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8748                     op = CUTGROUP;
8749                     RExC_seen |= REG_SEEN_CUTGROUP;
8750                 }
8751                 break;
8752             }
8753             if ( ! op ) {
8754                 RExC_parse++;
8755                 vFAIL3("Unknown verb pattern '%.*s'",
8756                     verb_len, start_verb);
8757             }
8758             if ( argok ) {
8759                 if ( start_arg && internal_argval ) {
8760                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8761                         verb_len, start_verb);
8762                 } else if ( argok < 0 && !start_arg ) {
8763                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8764                         verb_len, start_verb);
8765                 } else {
8766                     ret = reganode(pRExC_state, op, internal_argval);
8767                     if ( ! internal_argval && ! SIZE_ONLY ) {
8768                         if (start_arg) {
8769                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8770                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8771                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8772                             ret->flags = 0;
8773                         } else {
8774                             ret->flags = 1;
8775                         }
8776                     }
8777                 }
8778                 if (!internal_argval)
8779                     RExC_seen |= REG_SEEN_VERBARG;
8780             } else if ( start_arg ) {
8781                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8782                         verb_len, start_verb);
8783             } else {
8784                 ret = reg_node(pRExC_state, op);
8785             }
8786             nextchar(pRExC_state);
8787             return ret;
8788         }
8789         else if (*RExC_parse == '?') { /* (?...) */
8790             bool is_logical = 0;
8791             const char * const seqstart = RExC_parse;
8792             if (has_intervening_patws && SIZE_ONLY) {
8793                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8794             }
8795
8796             RExC_parse++;
8797             paren = *RExC_parse++;
8798             ret = NULL;                 /* For look-ahead/behind. */
8799             switch (paren) {
8800
8801             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8802                 paren = *RExC_parse++;
8803                 if ( paren == '<')         /* (?P<...>) named capture */
8804                     goto named_capture;
8805                 else if (paren == '>') {   /* (?P>name) named recursion */
8806                     goto named_recursion;
8807                 }
8808                 else if (paren == '=') {   /* (?P=...)  named backref */
8809                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8810                        you change this make sure you change that */
8811                     char* name_start = RExC_parse;
8812                     U32 num = 0;
8813                     SV *sv_dat = reg_scan_name(pRExC_state,
8814                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8815                     if (RExC_parse == name_start || *RExC_parse != ')')
8816                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8817
8818                     if (!SIZE_ONLY) {
8819                         num = add_data( pRExC_state, 1, "S" );
8820                         RExC_rxi->data->data[num]=(void*)sv_dat;
8821                         SvREFCNT_inc_simple_void(sv_dat);
8822                     }
8823                     RExC_sawback = 1;
8824                     ret = reganode(pRExC_state,
8825                                    ((! FOLD)
8826                                      ? NREF
8827                                      : (ASCII_FOLD_RESTRICTED)
8828                                        ? NREFFA
8829                                        : (AT_LEAST_UNI_SEMANTICS)
8830                                          ? NREFFU
8831                                          : (LOC)
8832                                            ? NREFFL
8833                                            : NREFF),
8834                                     num);
8835                     *flagp |= HASWIDTH;
8836
8837                     Set_Node_Offset(ret, parse_start+1);
8838                     Set_Node_Cur_Length(ret, parse_start);
8839
8840                     nextchar(pRExC_state);
8841                     return ret;
8842                 }
8843                 RExC_parse++;
8844                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8845                 /*NOTREACHED*/
8846             case '<':           /* (?<...) */
8847                 if (*RExC_parse == '!')
8848                     paren = ',';
8849                 else if (*RExC_parse != '=')
8850               named_capture:
8851                 {               /* (?<...>) */
8852                     char *name_start;
8853                     SV *svname;
8854                     paren= '>';
8855             case '\'':          /* (?'...') */
8856                     name_start= RExC_parse;
8857                     svname = reg_scan_name(pRExC_state,
8858                         SIZE_ONLY ?  /* reverse test from the others */
8859                         REG_RSN_RETURN_NAME :
8860                         REG_RSN_RETURN_NULL);
8861                     if (RExC_parse == name_start) {
8862                         RExC_parse++;
8863                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8864                         /*NOTREACHED*/
8865                     }
8866                     if (*RExC_parse != paren)
8867                         vFAIL2("Sequence (?%c... not terminated",
8868                             paren=='>' ? '<' : paren);
8869                     if (SIZE_ONLY) {
8870                         HE *he_str;
8871                         SV *sv_dat = NULL;
8872                         if (!svname) /* shouldn't happen */
8873                             Perl_croak(aTHX_
8874                                 "panic: reg_scan_name returned NULL");
8875                         if (!RExC_paren_names) {
8876                             RExC_paren_names= newHV();
8877                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8878 #ifdef DEBUGGING
8879                             RExC_paren_name_list= newAV();
8880                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8881 #endif
8882                         }
8883                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8884                         if ( he_str )
8885                             sv_dat = HeVAL(he_str);
8886                         if ( ! sv_dat ) {
8887                             /* croak baby croak */
8888                             Perl_croak(aTHX_
8889                                 "panic: paren_name hash element allocation failed");
8890                         } else if ( SvPOK(sv_dat) ) {
8891                             /* (?|...) can mean we have dupes so scan to check
8892                                its already been stored. Maybe a flag indicating
8893                                we are inside such a construct would be useful,
8894                                but the arrays are likely to be quite small, so
8895                                for now we punt -- dmq */
8896                             IV count = SvIV(sv_dat);
8897                             I32 *pv = (I32*)SvPVX(sv_dat);
8898                             IV i;
8899                             for ( i = 0 ; i < count ; i++ ) {
8900                                 if ( pv[i] == RExC_npar ) {
8901                                     count = 0;
8902                                     break;
8903                                 }
8904                             }
8905                             if ( count ) {
8906                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8907                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8908                                 pv[count] = RExC_npar;
8909                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8910                             }
8911                         } else {
8912                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8913                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8914                             SvIOK_on(sv_dat);
8915                             SvIV_set(sv_dat, 1);
8916                         }
8917 #ifdef DEBUGGING
8918                         /* Yes this does cause a memory leak in debugging Perls */
8919                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8920                             SvREFCNT_dec_NN(svname);
8921 #endif
8922
8923                         /*sv_dump(sv_dat);*/
8924                     }
8925                     nextchar(pRExC_state);
8926                     paren = 1;
8927                     goto capturing_parens;
8928                 }
8929                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8930                 RExC_in_lookbehind++;
8931                 RExC_parse++;
8932             case '=':           /* (?=...) */
8933                 RExC_seen_zerolen++;
8934                 break;
8935             case '!':           /* (?!...) */
8936                 RExC_seen_zerolen++;
8937                 if (*RExC_parse == ')') {
8938                     ret=reg_node(pRExC_state, OPFAIL);
8939                     nextchar(pRExC_state);
8940                     return ret;
8941                 }
8942                 break;
8943             case '|':           /* (?|...) */
8944                 /* branch reset, behave like a (?:...) except that
8945                    buffers in alternations share the same numbers */
8946                 paren = ':';
8947                 after_freeze = freeze_paren = RExC_npar;
8948                 break;
8949             case ':':           /* (?:...) */
8950             case '>':           /* (?>...) */
8951                 break;
8952             case '$':           /* (?$...) */
8953             case '@':           /* (?@...) */
8954                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8955                 break;
8956             case '#':           /* (?#...) */
8957                 /* XXX As soon as we disallow separating the '?' and '*' (by
8958                  * spaces or (?#...) comment), it is believed that this case
8959                  * will be unreachable and can be removed.  See
8960                  * [perl #117327] */
8961                 while (*RExC_parse && *RExC_parse != ')')
8962                     RExC_parse++;
8963                 if (*RExC_parse != ')')
8964                     FAIL("Sequence (?#... not terminated");
8965                 nextchar(pRExC_state);
8966                 *flagp = TRYAGAIN;
8967                 return NULL;
8968             case '0' :           /* (?0) */
8969             case 'R' :           /* (?R) */
8970                 if (*RExC_parse != ')')
8971                     FAIL("Sequence (?R) not terminated");
8972                 ret = reg_node(pRExC_state, GOSTART);
8973                 *flagp |= POSTPONED;
8974                 nextchar(pRExC_state);
8975                 return ret;
8976                 /*notreached*/
8977             { /* named and numeric backreferences */
8978                 I32 num;
8979             case '&':            /* (?&NAME) */
8980                 parse_start = RExC_parse - 1;
8981               named_recursion:
8982                 {
8983                     SV *sv_dat = reg_scan_name(pRExC_state,
8984                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8985                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8986                 }
8987                 goto gen_recurse_regop;
8988                 assert(0); /* NOT REACHED */
8989             case '+':
8990                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8991                     RExC_parse++;
8992                     vFAIL("Illegal pattern");
8993                 }
8994                 goto parse_recursion;
8995                 /* NOT REACHED*/
8996             case '-': /* (?-1) */
8997                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8998                     RExC_parse--; /* rewind to let it be handled later */
8999                     goto parse_flags;
9000                 }
9001                 /*FALLTHROUGH */
9002             case '1': case '2': case '3': case '4': /* (?1) */
9003             case '5': case '6': case '7': case '8': case '9':
9004                 RExC_parse--;
9005               parse_recursion:
9006                 num = atoi(RExC_parse);
9007                 parse_start = RExC_parse - 1; /* MJD */
9008                 if (*RExC_parse == '-')
9009                     RExC_parse++;
9010                 while (isDIGIT(*RExC_parse))
9011                         RExC_parse++;
9012                 if (*RExC_parse!=')')
9013                     vFAIL("Expecting close bracket");
9014
9015               gen_recurse_regop:
9016                 if ( paren == '-' ) {
9017                     /*
9018                     Diagram of capture buffer numbering.
9019                     Top line is the normal capture buffer numbers
9020                     Bottom line is the negative indexing as from
9021                     the X (the (?-2))
9022
9023                     +   1 2    3 4 5 X          6 7
9024                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9025                     -   5 4    3 2 1 X          x x
9026
9027                     */
9028                     num = RExC_npar + num;
9029                     if (num < 1)  {
9030                         RExC_parse++;
9031                         vFAIL("Reference to nonexistent group");
9032                     }
9033                 } else if ( paren == '+' ) {
9034                     num = RExC_npar + num - 1;
9035                 }
9036
9037                 ret = reganode(pRExC_state, GOSUB, num);
9038                 if (!SIZE_ONLY) {
9039                     if (num > (I32)RExC_rx->nparens) {
9040                         RExC_parse++;
9041                         vFAIL("Reference to nonexistent group");
9042                     }
9043                     ARG2L_SET( ret, RExC_recurse_count++);
9044                     RExC_emit++;
9045                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9046                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9047                 } else {
9048                     RExC_size++;
9049                 }
9050                 RExC_seen |= REG_SEEN_RECURSE;
9051                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9052                 Set_Node_Offset(ret, parse_start); /* MJD */
9053
9054                 *flagp |= POSTPONED;
9055                 nextchar(pRExC_state);
9056                 return ret;
9057             } /* named and numeric backreferences */
9058             assert(0); /* NOT REACHED */
9059
9060             case '?':           /* (??...) */
9061                 is_logical = 1;
9062                 if (*RExC_parse != '{') {
9063                     RExC_parse++;
9064                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9065                     /*NOTREACHED*/
9066                 }
9067                 *flagp |= POSTPONED;
9068                 paren = *RExC_parse++;
9069                 /* FALL THROUGH */
9070             case '{':           /* (?{...}) */
9071             {
9072                 U32 n = 0;
9073                 struct reg_code_block *cb;
9074
9075                 RExC_seen_zerolen++;
9076
9077                 if (   !pRExC_state->num_code_blocks
9078                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9079                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9080                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9081                             - RExC_start)
9082                 ) {
9083                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9084                         FAIL("panic: Sequence (?{...}): no code block found\n");
9085                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9086                 }
9087                 /* this is a pre-compiled code block (?{...}) */
9088                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9089                 RExC_parse = RExC_start + cb->end;
9090                 if (!SIZE_ONLY) {
9091                     OP *o = cb->block;
9092                     if (cb->src_regex) {
9093                         n = add_data(pRExC_state, 2, "rl");
9094                         RExC_rxi->data->data[n] =
9095                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9096                         RExC_rxi->data->data[n+1] = (void*)o;
9097                     }
9098                     else {
9099                         n = add_data(pRExC_state, 1,
9100                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9101                         RExC_rxi->data->data[n] = (void*)o;
9102                     }
9103                 }
9104                 pRExC_state->code_index++;
9105                 nextchar(pRExC_state);
9106
9107                 if (is_logical) {
9108                     regnode *eval;
9109                     ret = reg_node(pRExC_state, LOGICAL);
9110                     eval = reganode(pRExC_state, EVAL, n);
9111                     if (!SIZE_ONLY) {
9112                         ret->flags = 2;
9113                         /* for later propagation into (??{}) return value */
9114                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9115                     }
9116                     REGTAIL(pRExC_state, ret, eval);
9117                     /* deal with the length of this later - MJD */
9118                     return ret;
9119                 }
9120                 ret = reganode(pRExC_state, EVAL, n);
9121                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9122                 Set_Node_Offset(ret, parse_start);
9123                 return ret;
9124             }
9125             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9126             {
9127                 int is_define= 0;
9128                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9129                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9130                         || RExC_parse[1] == '<'
9131                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9132                         I32 flag;
9133                         regnode *tail;
9134
9135                         ret = reg_node(pRExC_state, LOGICAL);
9136                         if (!SIZE_ONLY)
9137                             ret->flags = 1;
9138
9139                         tail = reg(pRExC_state, 1, &flag, depth+1);
9140                         if (flag & RESTART_UTF8) {
9141                             *flagp = RESTART_UTF8;
9142                             return NULL;
9143                         }
9144                         REGTAIL(pRExC_state, ret, tail);
9145                         goto insert_if;
9146                     }
9147                 }
9148                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9149                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9150                 {
9151                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9152                     char *name_start= RExC_parse++;
9153                     U32 num = 0;
9154                     SV *sv_dat=reg_scan_name(pRExC_state,
9155                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9156                     if (RExC_parse == name_start || *RExC_parse != ch)
9157                         vFAIL2("Sequence (?(%c... not terminated",
9158                             (ch == '>' ? '<' : ch));
9159                     RExC_parse++;
9160                     if (!SIZE_ONLY) {
9161                         num = add_data( pRExC_state, 1, "S" );
9162                         RExC_rxi->data->data[num]=(void*)sv_dat;
9163                         SvREFCNT_inc_simple_void(sv_dat);
9164                     }
9165                     ret = reganode(pRExC_state,NGROUPP,num);
9166                     goto insert_if_check_paren;
9167                 }
9168                 else if (RExC_parse[0] == 'D' &&
9169                          RExC_parse[1] == 'E' &&
9170                          RExC_parse[2] == 'F' &&
9171                          RExC_parse[3] == 'I' &&
9172                          RExC_parse[4] == 'N' &&
9173                          RExC_parse[5] == 'E')
9174                 {
9175                     ret = reganode(pRExC_state,DEFINEP,0);
9176                     RExC_parse +=6 ;
9177                     is_define = 1;
9178                     goto insert_if_check_paren;
9179                 }
9180                 else if (RExC_parse[0] == 'R') {
9181                     RExC_parse++;
9182                     parno = 0;
9183                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9184                         parno = atoi(RExC_parse++);
9185                         while (isDIGIT(*RExC_parse))
9186                             RExC_parse++;
9187                     } else if (RExC_parse[0] == '&') {
9188                         SV *sv_dat;
9189                         RExC_parse++;
9190                         sv_dat = reg_scan_name(pRExC_state,
9191                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9192                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9193                     }
9194                     ret = reganode(pRExC_state,INSUBP,parno);
9195                     goto insert_if_check_paren;
9196                 }
9197                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9198                     /* (?(1)...) */
9199                     char c;
9200                     parno = atoi(RExC_parse++);
9201
9202                     while (isDIGIT(*RExC_parse))
9203                         RExC_parse++;
9204                     ret = reganode(pRExC_state, GROUPP, parno);
9205
9206                  insert_if_check_paren:
9207                     if ((c = *nextchar(pRExC_state)) != ')')
9208                         vFAIL("Switch condition not recognized");
9209                   insert_if:
9210                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9211                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9212                     if (br == NULL) {
9213                         if (flags & RESTART_UTF8) {
9214                             *flagp = RESTART_UTF8;
9215                             return NULL;
9216                         }
9217                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9218                               (UV) flags);
9219                     } else
9220                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9221                     c = *nextchar(pRExC_state);
9222                     if (flags&HASWIDTH)
9223                         *flagp |= HASWIDTH;
9224                     if (c == '|') {
9225                         if (is_define)
9226                             vFAIL("(?(DEFINE)....) does not allow branches");
9227                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9228                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9229                             if (flags & RESTART_UTF8) {
9230                                 *flagp = RESTART_UTF8;
9231                                 return NULL;
9232                             }
9233                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9234                                   (UV) flags);
9235                         }
9236                         REGTAIL(pRExC_state, ret, lastbr);
9237                         if (flags&HASWIDTH)
9238                             *flagp |= HASWIDTH;
9239                         c = *nextchar(pRExC_state);
9240                     }
9241                     else
9242                         lastbr = NULL;
9243                     if (c != ')')
9244                         vFAIL("Switch (?(condition)... contains too many branches");
9245                     ender = reg_node(pRExC_state, TAIL);
9246                     REGTAIL(pRExC_state, br, ender);
9247                     if (lastbr) {
9248                         REGTAIL(pRExC_state, lastbr, ender);
9249                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9250                     }
9251                     else
9252                         REGTAIL(pRExC_state, ret, ender);
9253                     RExC_size++; /* XXX WHY do we need this?!!
9254                                     For large programs it seems to be required
9255                                     but I can't figure out why. -- dmq*/
9256                     return ret;
9257                 }
9258                 else {
9259                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9260                 }
9261             }
9262             case '[':           /* (?[ ... ]) */
9263                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9264                                          oregcomp_parse);
9265             case 0:
9266                 RExC_parse--; /* for vFAIL to print correctly */
9267                 vFAIL("Sequence (? incomplete");
9268                 break;
9269             default: /* e.g., (?i) */
9270                 --RExC_parse;
9271               parse_flags:
9272                 parse_lparen_question_flags(pRExC_state);
9273                 if (UCHARAT(RExC_parse) != ':') {
9274                     nextchar(pRExC_state);
9275                     *flagp = TRYAGAIN;
9276                     return NULL;
9277                 }
9278                 paren = ':';
9279                 nextchar(pRExC_state);
9280                 ret = NULL;
9281                 goto parse_rest;
9282             } /* end switch */
9283         }
9284         else {                  /* (...) */
9285           capturing_parens:
9286             parno = RExC_npar;
9287             RExC_npar++;
9288
9289             ret = reganode(pRExC_state, OPEN, parno);
9290             if (!SIZE_ONLY ){
9291                 if (!RExC_nestroot)
9292                     RExC_nestroot = parno;
9293                 if (RExC_seen & REG_SEEN_RECURSE
9294                     && !RExC_open_parens[parno-1])
9295                 {
9296                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9297                         "Setting open paren #%"IVdf" to %d\n",
9298                         (IV)parno, REG_NODE_NUM(ret)));
9299                     RExC_open_parens[parno-1]= ret;
9300                 }
9301             }
9302             Set_Node_Length(ret, 1); /* MJD */
9303             Set_Node_Offset(ret, RExC_parse); /* MJD */
9304             is_open = 1;
9305         }
9306     }
9307     else                        /* ! paren */
9308         ret = NULL;
9309
9310    parse_rest:
9311     /* Pick up the branches, linking them together. */
9312     parse_start = RExC_parse;   /* MJD */
9313     br = regbranch(pRExC_state, &flags, 1,depth+1);
9314
9315     /*     branch_len = (paren != 0); */
9316
9317     if (br == NULL) {
9318         if (flags & RESTART_UTF8) {
9319             *flagp = RESTART_UTF8;
9320             return NULL;
9321         }
9322         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9323     }
9324     if (*RExC_parse == '|') {
9325         if (!SIZE_ONLY && RExC_extralen) {
9326             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9327         }
9328         else {                  /* MJD */
9329             reginsert(pRExC_state, BRANCH, br, depth+1);
9330             Set_Node_Length(br, paren != 0);
9331             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9332         }
9333         have_branch = 1;
9334         if (SIZE_ONLY)
9335             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9336     }
9337     else if (paren == ':') {
9338         *flagp |= flags&SIMPLE;
9339     }
9340     if (is_open) {                              /* Starts with OPEN. */
9341         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9342     }
9343     else if (paren != '?')              /* Not Conditional */
9344         ret = br;
9345     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9346     lastbr = br;
9347     while (*RExC_parse == '|') {
9348         if (!SIZE_ONLY && RExC_extralen) {
9349             ender = reganode(pRExC_state, LONGJMP,0);
9350             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9351         }
9352         if (SIZE_ONLY)
9353             RExC_extralen += 2;         /* Account for LONGJMP. */
9354         nextchar(pRExC_state);
9355         if (freeze_paren) {
9356             if (RExC_npar > after_freeze)
9357                 after_freeze = RExC_npar;
9358             RExC_npar = freeze_paren;
9359         }
9360         br = regbranch(pRExC_state, &flags, 0, depth+1);
9361
9362         if (br == NULL) {
9363             if (flags & RESTART_UTF8) {
9364                 *flagp = RESTART_UTF8;
9365                 return NULL;
9366             }
9367             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9368         }
9369         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9370         lastbr = br;
9371         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9372     }
9373
9374     if (have_branch || paren != ':') {
9375         /* Make a closing node, and hook it on the end. */
9376         switch (paren) {
9377         case ':':
9378             ender = reg_node(pRExC_state, TAIL);
9379             break;
9380         case 1: case 2:
9381             ender = reganode(pRExC_state, CLOSE, parno);
9382             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9383                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9384                         "Setting close paren #%"IVdf" to %d\n",
9385                         (IV)parno, REG_NODE_NUM(ender)));
9386                 RExC_close_parens[parno-1]= ender;
9387                 if (RExC_nestroot == parno)
9388                     RExC_nestroot = 0;
9389             }
9390             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9391             Set_Node_Length(ender,1); /* MJD */
9392             break;
9393         case '<':
9394         case ',':
9395         case '=':
9396         case '!':
9397             *flagp &= ~HASWIDTH;
9398             /* FALL THROUGH */
9399         case '>':
9400             ender = reg_node(pRExC_state, SUCCEED);
9401             break;
9402         case 0:
9403             ender = reg_node(pRExC_state, END);
9404             if (!SIZE_ONLY) {
9405                 assert(!RExC_opend); /* there can only be one! */
9406                 RExC_opend = ender;
9407             }
9408             break;
9409         }
9410         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9411             SV * const mysv_val1=sv_newmortal();
9412             SV * const mysv_val2=sv_newmortal();
9413             DEBUG_PARSE_MSG("lsbr");
9414             regprop(RExC_rx, mysv_val1, lastbr);
9415             regprop(RExC_rx, mysv_val2, ender);
9416             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9417                           SvPV_nolen_const(mysv_val1),
9418                           (IV)REG_NODE_NUM(lastbr),
9419                           SvPV_nolen_const(mysv_val2),
9420                           (IV)REG_NODE_NUM(ender),
9421                           (IV)(ender - lastbr)
9422             );
9423         });
9424         REGTAIL(pRExC_state, lastbr, ender);
9425
9426         if (have_branch && !SIZE_ONLY) {
9427             char is_nothing= 1;
9428             if (depth==1)
9429                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9430
9431             /* Hook the tails of the branches to the closing node. */
9432             for (br = ret; br; br = regnext(br)) {
9433                 const U8 op = PL_regkind[OP(br)];
9434                 if (op == BRANCH) {
9435                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9436                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9437                         is_nothing= 0;
9438                 }
9439                 else if (op == BRANCHJ) {
9440                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9441                     /* for now we always disable this optimisation * /
9442                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9443                     */
9444                         is_nothing= 0;
9445                 }
9446             }
9447             if (is_nothing) {
9448                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9449                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9450                     SV * const mysv_val1=sv_newmortal();
9451                     SV * const mysv_val2=sv_newmortal();
9452                     DEBUG_PARSE_MSG("NADA");
9453                     regprop(RExC_rx, mysv_val1, ret);
9454                     regprop(RExC_rx, mysv_val2, ender);
9455                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9456                                   SvPV_nolen_const(mysv_val1),
9457                                   (IV)REG_NODE_NUM(ret),
9458                                   SvPV_nolen_const(mysv_val2),
9459                                   (IV)REG_NODE_NUM(ender),
9460                                   (IV)(ender - ret)
9461                     );
9462                 });
9463                 OP(br)= NOTHING;
9464                 if (OP(ender) == TAIL) {
9465                     NEXT_OFF(br)= 0;
9466                     RExC_emit= br + 1;
9467                 } else {
9468                     regnode *opt;
9469                     for ( opt= br + 1; opt < ender ; opt++ )
9470                         OP(opt)= OPTIMIZED;
9471                     NEXT_OFF(br)= ender - br;
9472                 }
9473             }
9474         }
9475     }
9476
9477     {
9478         const char *p;
9479         static const char parens[] = "=!<,>";
9480
9481         if (paren && (p = strchr(parens, paren))) {
9482             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9483             int flag = (p - parens) > 1;
9484
9485             if (paren == '>')
9486                 node = SUSPEND, flag = 0;
9487             reginsert(pRExC_state, node,ret, depth+1);
9488             Set_Node_Cur_Length(ret, parse_start);
9489             Set_Node_Offset(ret, parse_start + 1);
9490             ret->flags = flag;
9491             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9492         }
9493     }
9494
9495     /* Check for proper termination. */
9496     if (paren) {
9497         /* restore original flags, but keep (?p) */
9498         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9499         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9500             RExC_parse = oregcomp_parse;
9501             vFAIL("Unmatched (");
9502         }
9503     }
9504     else if (!paren && RExC_parse < RExC_end) {
9505         if (*RExC_parse == ')') {
9506             RExC_parse++;
9507             vFAIL("Unmatched )");
9508         }
9509         else
9510             FAIL("Junk on end of regexp");      /* "Can't happen". */
9511         assert(0); /* NOTREACHED */
9512     }
9513
9514     if (RExC_in_lookbehind) {
9515         RExC_in_lookbehind--;
9516     }
9517     if (after_freeze > RExC_npar)
9518         RExC_npar = after_freeze;
9519     return(ret);
9520 }
9521
9522 /*
9523  - regbranch - one alternative of an | operator
9524  *
9525  * Implements the concatenation operator.
9526  *
9527  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9528  * restarted.
9529  */
9530 STATIC regnode *
9531 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9532 {
9533     dVAR;
9534     regnode *ret;
9535     regnode *chain = NULL;
9536     regnode *latest;
9537     I32 flags = 0, c = 0;
9538     GET_RE_DEBUG_FLAGS_DECL;
9539
9540     PERL_ARGS_ASSERT_REGBRANCH;
9541
9542     DEBUG_PARSE("brnc");
9543
9544     if (first)
9545         ret = NULL;
9546     else {
9547         if (!SIZE_ONLY && RExC_extralen)
9548             ret = reganode(pRExC_state, BRANCHJ,0);
9549         else {
9550             ret = reg_node(pRExC_state, BRANCH);
9551             Set_Node_Length(ret, 1);
9552         }
9553     }
9554
9555     if (!first && SIZE_ONLY)
9556         RExC_extralen += 1;                     /* BRANCHJ */
9557
9558     *flagp = WORST;                     /* Tentatively. */
9559
9560     RExC_parse--;
9561     nextchar(pRExC_state);
9562     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9563         flags &= ~TRYAGAIN;
9564         latest = regpiece(pRExC_state, &flags,depth+1);
9565         if (latest == NULL) {
9566             if (flags & TRYAGAIN)
9567                 continue;
9568             if (flags & RESTART_UTF8) {
9569                 *flagp = RESTART_UTF8;
9570                 return NULL;
9571             }
9572             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9573         }
9574         else if (ret == NULL)
9575             ret = latest;
9576         *flagp |= flags&(HASWIDTH|POSTPONED);
9577         if (chain == NULL)      /* First piece. */
9578             *flagp |= flags&SPSTART;
9579         else {
9580             RExC_naughty++;
9581             REGTAIL(pRExC_state, chain, latest);
9582         }
9583         chain = latest;
9584         c++;
9585     }
9586     if (chain == NULL) {        /* Loop ran zero times. */
9587         chain = reg_node(pRExC_state, NOTHING);
9588         if (ret == NULL)
9589             ret = chain;
9590     }
9591     if (c == 1) {
9592         *flagp |= flags&SIMPLE;
9593     }
9594
9595     return ret;
9596 }
9597
9598 /*
9599  - regpiece - something followed by possible [*+?]
9600  *
9601  * Note that the branching code sequences used for ? and the general cases
9602  * of * and + are somewhat optimized:  they use the same NOTHING node as
9603  * both the endmarker for their branch list and the body of the last branch.
9604  * It might seem that this node could be dispensed with entirely, but the
9605  * endmarker role is not redundant.
9606  *
9607  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9608  * TRYAGAIN.
9609  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9610  * restarted.
9611  */
9612 STATIC regnode *
9613 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9614 {
9615     dVAR;
9616     regnode *ret;
9617     char op;
9618     char *next;
9619     I32 flags;
9620     const char * const origparse = RExC_parse;
9621     I32 min;
9622     I32 max = REG_INFTY;
9623 #ifdef RE_TRACK_PATTERN_OFFSETS
9624     char *parse_start;
9625 #endif
9626     const char *maxpos = NULL;
9627
9628     /* Save the original in case we change the emitted regop to a FAIL. */
9629     regnode * const orig_emit = RExC_emit;
9630
9631     GET_RE_DEBUG_FLAGS_DECL;
9632
9633     PERL_ARGS_ASSERT_REGPIECE;
9634
9635     DEBUG_PARSE("piec");
9636
9637     ret = regatom(pRExC_state, &flags,depth+1);
9638     if (ret == NULL) {
9639         if (flags & (TRYAGAIN|RESTART_UTF8))
9640             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9641         else
9642             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9643         return(NULL);
9644     }
9645
9646     op = *RExC_parse;
9647
9648     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9649         maxpos = NULL;
9650 #ifdef RE_TRACK_PATTERN_OFFSETS
9651         parse_start = RExC_parse; /* MJD */
9652 #endif
9653         next = RExC_parse + 1;
9654         while (isDIGIT(*next) || *next == ',') {
9655             if (*next == ',') {
9656                 if (maxpos)
9657                     break;
9658                 else
9659                     maxpos = next;
9660             }
9661             next++;
9662         }
9663         if (*next == '}') {             /* got one */
9664             if (!maxpos)
9665                 maxpos = next;
9666             RExC_parse++;
9667             min = atoi(RExC_parse);
9668             if (*maxpos == ',')
9669                 maxpos++;
9670             else
9671                 maxpos = RExC_parse;
9672             max = atoi(maxpos);
9673             if (!max && *maxpos != '0')
9674                 max = REG_INFTY;                /* meaning "infinity" */
9675             else if (max >= REG_INFTY)
9676                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9677             RExC_parse = next;
9678             nextchar(pRExC_state);
9679             if (max < min) {    /* If can't match, warn and optimize to fail
9680                                    unconditionally */
9681                 if (SIZE_ONLY) {
9682                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9683
9684                     /* We can't back off the size because we have to reserve
9685                      * enough space for all the things we are about to throw
9686                      * away, but we can shrink it by the ammount we are about
9687                      * to re-use here */
9688                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9689                 }
9690                 else {
9691                     RExC_emit = orig_emit;
9692                 }
9693                 ret = reg_node(pRExC_state, OPFAIL);
9694                 return ret;
9695             }
9696
9697         do_curly:
9698             if ((flags&SIMPLE)) {
9699                 RExC_naughty += 2 + RExC_naughty / 2;
9700                 reginsert(pRExC_state, CURLY, ret, depth+1);
9701                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9702                 Set_Node_Cur_Length(ret, parse_start);
9703             }
9704             else {
9705                 regnode * const w = reg_node(pRExC_state, WHILEM);
9706
9707                 w->flags = 0;
9708                 REGTAIL(pRExC_state, ret, w);
9709                 if (!SIZE_ONLY && RExC_extralen) {
9710                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9711                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9712                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9713                 }
9714                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9715                                 /* MJD hk */
9716                 Set_Node_Offset(ret, parse_start+1);
9717                 Set_Node_Length(ret,
9718                                 op == '{' ? (RExC_parse - parse_start) : 1);
9719
9720                 if (!SIZE_ONLY && RExC_extralen)
9721                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9722                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9723                 if (SIZE_ONLY)
9724                     RExC_whilem_seen++, RExC_extralen += 3;
9725                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9726             }
9727             ret->flags = 0;
9728
9729             if (min > 0)
9730                 *flagp = WORST;
9731             if (max > 0)
9732                 *flagp |= HASWIDTH;
9733             if (!SIZE_ONLY) {
9734                 ARG1_SET(ret, (U16)min);
9735                 ARG2_SET(ret, (U16)max);
9736             }
9737
9738             goto nest_check;
9739         }
9740     }
9741
9742     if (!ISMULT1(op)) {
9743         *flagp = flags;
9744         return(ret);
9745     }
9746
9747 #if 0                           /* Now runtime fix should be reliable. */
9748
9749     /* if this is reinstated, don't forget to put this back into perldiag:
9750
9751             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9752
9753            (F) The part of the regexp subject to either the * or + quantifier
9754            could match an empty string. The {#} shows in the regular
9755            expression about where the problem was discovered.
9756
9757     */
9758
9759     if (!(flags&HASWIDTH) && op != '?')
9760       vFAIL("Regexp *+ operand could be empty");
9761 #endif
9762
9763 #ifdef RE_TRACK_PATTERN_OFFSETS
9764     parse_start = RExC_parse;
9765 #endif
9766     nextchar(pRExC_state);
9767
9768     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9769
9770     if (op == '*' && (flags&SIMPLE)) {
9771         reginsert(pRExC_state, STAR, ret, depth+1);
9772         ret->flags = 0;
9773         RExC_naughty += 4;
9774     }
9775     else if (op == '*') {
9776         min = 0;
9777         goto do_curly;
9778     }
9779     else if (op == '+' && (flags&SIMPLE)) {
9780         reginsert(pRExC_state, PLUS, ret, depth+1);
9781         ret->flags = 0;
9782         RExC_naughty += 3;
9783     }
9784     else if (op == '+') {
9785         min = 1;
9786         goto do_curly;
9787     }
9788     else if (op == '?') {
9789         min = 0; max = 1;
9790         goto do_curly;
9791     }
9792   nest_check:
9793     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9794         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9795         ckWARN3reg(RExC_parse,
9796                    "%.*s matches null string many times",
9797                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9798                    origparse);
9799         (void)ReREFCNT_inc(RExC_rx_sv);
9800     }
9801
9802     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9803         nextchar(pRExC_state);
9804         reginsert(pRExC_state, MINMOD, ret, depth+1);
9805         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9806     }
9807     else
9808     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9809         regnode *ender;
9810         nextchar(pRExC_state);
9811         ender = reg_node(pRExC_state, SUCCEED);
9812         REGTAIL(pRExC_state, ret, ender);
9813         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9814         ret->flags = 0;
9815         ender = reg_node(pRExC_state, TAIL);
9816         REGTAIL(pRExC_state, ret, ender);
9817     }
9818
9819     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9820         RExC_parse++;
9821         vFAIL("Nested quantifiers");
9822     }
9823
9824     return(ret);
9825 }
9826
9827 STATIC bool
9828 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9829         const bool strict   /* Apply stricter parsing rules? */
9830     )
9831 {
9832
9833  /* This is expected to be called by a parser routine that has recognized '\N'
9834    and needs to handle the rest. RExC_parse is expected to point at the first
9835    char following the N at the time of the call.  On successful return,
9836    RExC_parse has been updated to point to just after the sequence identified
9837    by this routine, and <*flagp> has been updated.
9838
9839    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9840    character class.
9841
9842    \N may begin either a named sequence, or if outside a character class, mean
9843    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9844    attempted to decide which, and in the case of a named sequence, converted it
9845    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9846    where c1... are the characters in the sequence.  For single-quoted regexes,
9847    the tokenizer passes the \N sequence through unchanged; this code will not
9848    attempt to determine this nor expand those, instead raising a syntax error.
9849    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9850    or there is no '}', it signals that this \N occurrence means to match a
9851    non-newline.
9852
9853    Only the \N{U+...} form should occur in a character class, for the same
9854    reason that '.' inside a character class means to just match a period: it
9855    just doesn't make sense.
9856
9857    The function raises an error (via vFAIL), and doesn't return for various
9858    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9859    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9860    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9861    only possible if node_p is non-NULL.
9862
9863
9864    If <valuep> is non-null, it means the caller can accept an input sequence
9865    consisting of a just a single code point; <*valuep> is set to that value
9866    if the input is such.
9867
9868    If <node_p> is non-null it signifies that the caller can accept any other
9869    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9870    is set as follows:
9871     1) \N means not-a-NL: points to a newly created REG_ANY node;
9872     2) \N{}:              points to a new NOTHING node;
9873     3) otherwise:         points to a new EXACT node containing the resolved
9874                           string.
9875    Note that FALSE is returned for single code point sequences if <valuep> is
9876    null.
9877  */
9878
9879     char * endbrace;    /* '}' following the name */
9880     char* p;
9881     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9882                            stream */
9883     bool has_multiple_chars; /* true if the input stream contains a sequence of
9884                                 more than one character */
9885
9886     GET_RE_DEBUG_FLAGS_DECL;
9887
9888     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9889
9890     GET_RE_DEBUG_FLAGS;
9891
9892     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9893
9894     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9895      * modifier.  The other meaning does not */
9896     p = (RExC_flags & RXf_PMf_EXTENDED)
9897         ? regwhite( pRExC_state, RExC_parse )
9898         : RExC_parse;
9899
9900     /* Disambiguate between \N meaning a named character versus \N meaning
9901      * [^\n].  The former is assumed when it can't be the latter. */
9902     if (*p != '{' || regcurly(p, FALSE)) {
9903         RExC_parse = p;
9904         if (! node_p) {
9905             /* no bare \N in a charclass */
9906             if (in_char_class) {
9907                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9908             }
9909             return FALSE;
9910         }
9911         nextchar(pRExC_state);
9912         *node_p = reg_node(pRExC_state, REG_ANY);
9913         *flagp |= HASWIDTH|SIMPLE;
9914         RExC_naughty++;
9915         RExC_parse--;
9916         Set_Node_Length(*node_p, 1); /* MJD */
9917         return TRUE;
9918     }
9919
9920     /* Here, we have decided it should be a named character or sequence */
9921
9922     /* The test above made sure that the next real character is a '{', but
9923      * under the /x modifier, it could be separated by space (or a comment and
9924      * \n) and this is not allowed (for consistency with \x{...} and the
9925      * tokenizer handling of \N{NAME}). */
9926     if (*RExC_parse != '{') {
9927         vFAIL("Missing braces on \\N{}");
9928     }
9929
9930     RExC_parse++;       /* Skip past the '{' */
9931
9932     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9933         || ! (endbrace == RExC_parse            /* nothing between the {} */
9934               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9935                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9936     {
9937         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9938         vFAIL("\\N{NAME} must be resolved by the lexer");
9939     }
9940
9941     if (endbrace == RExC_parse) {   /* empty: \N{} */
9942         bool ret = TRUE;
9943         if (node_p) {
9944             *node_p = reg_node(pRExC_state,NOTHING);
9945         }
9946         else if (in_char_class) {
9947             if (SIZE_ONLY && in_char_class) {
9948                 if (strict) {
9949                     RExC_parse++;   /* Position after the "}" */
9950                     vFAIL("Zero length \\N{}");
9951                 }
9952                 else {
9953                     ckWARNreg(RExC_parse,
9954                               "Ignoring zero length \\N{} in character class");
9955                 }
9956             }
9957             ret = FALSE;
9958         }
9959         else {
9960             return FALSE;
9961         }
9962         nextchar(pRExC_state);
9963         return ret;
9964     }
9965
9966     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9967     RExC_parse += 2;    /* Skip past the 'U+' */
9968
9969     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9970
9971     /* Code points are separated by dots.  If none, there is only one code
9972      * point, and is terminated by the brace */
9973     has_multiple_chars = (endchar < endbrace);
9974
9975     if (valuep && (! has_multiple_chars || in_char_class)) {
9976         /* We only pay attention to the first char of
9977         multichar strings being returned in char classes. I kinda wonder
9978         if this makes sense as it does change the behaviour
9979         from earlier versions, OTOH that behaviour was broken
9980         as well. XXX Solution is to recharacterize as
9981         [rest-of-class]|multi1|multi2... */
9982
9983         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9984         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9985             | PERL_SCAN_DISALLOW_PREFIX
9986             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9987
9988         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9989
9990         /* The tokenizer should have guaranteed validity, but it's possible to
9991          * bypass it by using single quoting, so check */
9992         if (length_of_hex == 0
9993             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9994         {
9995             RExC_parse += length_of_hex;        /* Includes all the valid */
9996             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9997                             ? UTF8SKIP(RExC_parse)
9998                             : 1;
9999             /* Guard against malformed utf8 */
10000             if (RExC_parse >= endchar) {
10001                 RExC_parse = endchar;
10002             }
10003             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10004         }
10005
10006         if (in_char_class && has_multiple_chars) {
10007             if (strict) {
10008                 RExC_parse = endbrace;
10009                 vFAIL("\\N{} in character class restricted to one character");
10010             }
10011             else {
10012                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10013             }
10014         }
10015
10016         RExC_parse = endbrace + 1;
10017     }
10018     else if (! node_p || ! has_multiple_chars) {
10019
10020         /* Here, the input is legal, but not according to the caller's
10021          * options.  We fail without advancing the parse, so that the
10022          * caller can try again */
10023         RExC_parse = p;
10024         return FALSE;
10025     }
10026     else {
10027
10028         /* What is done here is to convert this to a sub-pattern of the form
10029          * (?:\x{char1}\x{char2}...)
10030          * and then call reg recursively.  That way, it retains its atomicness,
10031          * while not having to worry about special handling that some code
10032          * points may have.  toke.c has converted the original Unicode values
10033          * to native, so that we can just pass on the hex values unchanged.  We
10034          * do have to set a flag to keep recoding from happening in the
10035          * recursion */
10036
10037         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10038         STRLEN len;
10039         char *orig_end = RExC_end;
10040         I32 flags;
10041
10042         while (RExC_parse < endbrace) {
10043
10044             /* Convert to notation the rest of the code understands */
10045             sv_catpv(substitute_parse, "\\x{");
10046             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10047             sv_catpv(substitute_parse, "}");
10048
10049             /* Point to the beginning of the next character in the sequence. */
10050             RExC_parse = endchar + 1;
10051             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10052         }
10053         sv_catpv(substitute_parse, ")");
10054
10055         RExC_parse = SvPV(substitute_parse, len);
10056
10057         /* Don't allow empty number */
10058         if (len < 8) {
10059             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10060         }
10061         RExC_end = RExC_parse + len;
10062
10063         /* The values are Unicode, and therefore not subject to recoding */
10064         RExC_override_recoding = 1;
10065
10066         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10067             if (flags & RESTART_UTF8) {
10068                 *flagp = RESTART_UTF8;
10069                 return FALSE;
10070             }
10071             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10072                   (UV) flags);
10073         }
10074         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10075
10076         RExC_parse = endbrace;
10077         RExC_end = orig_end;
10078         RExC_override_recoding = 0;
10079
10080         nextchar(pRExC_state);
10081     }
10082
10083     return TRUE;
10084 }
10085
10086
10087 /*
10088  * reg_recode
10089  *
10090  * It returns the code point in utf8 for the value in *encp.
10091  *    value: a code value in the source encoding
10092  *    encp:  a pointer to an Encode object
10093  *
10094  * If the result from Encode is not a single character,
10095  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10096  */
10097 STATIC UV
10098 S_reg_recode(pTHX_ const char value, SV **encp)
10099 {
10100     STRLEN numlen = 1;
10101     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10102     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10103     const STRLEN newlen = SvCUR(sv);
10104     UV uv = UNICODE_REPLACEMENT;
10105
10106     PERL_ARGS_ASSERT_REG_RECODE;
10107
10108     if (newlen)
10109         uv = SvUTF8(sv)
10110              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10111              : *(U8*)s;
10112
10113     if (!newlen || numlen != newlen) {
10114         uv = UNICODE_REPLACEMENT;
10115         *encp = NULL;
10116     }
10117     return uv;
10118 }
10119
10120 PERL_STATIC_INLINE U8
10121 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10122 {
10123     U8 op;
10124
10125     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10126
10127     if (! FOLD) {
10128         return EXACT;
10129     }
10130
10131     op = get_regex_charset(RExC_flags);
10132     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10133         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10134                  been, so there is no hole */
10135     }
10136
10137     return op + EXACTF;
10138 }
10139
10140 PERL_STATIC_INLINE void
10141 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10142 {
10143     /* This knows the details about sizing an EXACTish node, setting flags for
10144      * it (by setting <*flagp>, and potentially populating it with a single
10145      * character.
10146      *
10147      * If <len> (the length in bytes) is non-zero, this function assumes that
10148      * the node has already been populated, and just does the sizing.  In this
10149      * case <code_point> should be the final code point that has already been
10150      * placed into the node.  This value will be ignored except that under some
10151      * circumstances <*flagp> is set based on it.
10152      *
10153      * If <len> is zero, the function assumes that the node is to contain only
10154      * the single character given by <code_point> and calculates what <len>
10155      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10156      * additionally will populate the node's STRING with <code_point>, if <len>
10157      * is 0.  In both cases <*flagp> is appropriately set
10158      *
10159      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10160      * 255, must be folded (the former only when the rules indicate it can
10161      * match 'ss') */
10162
10163     bool len_passed_in = cBOOL(len != 0);
10164     U8 character[UTF8_MAXBYTES_CASE+1];
10165
10166     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10167
10168     if (! len_passed_in) {
10169         if (UTF) {
10170             if (FOLD && (! LOC || code_point > 255)) {
10171                 _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10172                                    character,
10173                                    &len,
10174                                    FOLD_FLAGS_FULL | ((LOC)
10175                                                      ? FOLD_FLAGS_LOCALE
10176                                                      : (ASCII_FOLD_RESTRICTED)
10177                                                        ? FOLD_FLAGS_NOMIX_ASCII
10178                                                        : 0));
10179             }
10180             else {
10181                 uvchr_to_utf8( character, code_point);
10182                 len = UTF8SKIP(character);
10183             }
10184         }
10185         else if (! FOLD
10186                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10187                  || ASCII_FOLD_RESTRICTED
10188                  || ! AT_LEAST_UNI_SEMANTICS)
10189         {
10190             *character = (U8) code_point;
10191             len = 1;
10192         }
10193         else {
10194             *character = 's';
10195             *(character + 1) = 's';
10196             len = 2;
10197         }
10198     }
10199
10200     if (SIZE_ONLY) {
10201         RExC_size += STR_SZ(len);
10202     }
10203     else {
10204         RExC_emit += STR_SZ(len);
10205         STR_LEN(node) = len;
10206         if (! len_passed_in) {
10207             Copy((char *) character, STRING(node), len, char);
10208         }
10209     }
10210
10211     *flagp |= HASWIDTH;
10212
10213     /* A single character node is SIMPLE, except for the special-cased SHARP S
10214      * under /di. */
10215     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10216         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10217             || ! FOLD || ! DEPENDS_SEMANTICS))
10218     {
10219         *flagp |= SIMPLE;
10220     }
10221 }
10222
10223 /*
10224  - regatom - the lowest level
10225
10226    Try to identify anything special at the start of the pattern. If there
10227    is, then handle it as required. This may involve generating a single regop,
10228    such as for an assertion; or it may involve recursing, such as to
10229    handle a () structure.
10230
10231    If the string doesn't start with something special then we gobble up
10232    as much literal text as we can.
10233
10234    Once we have been able to handle whatever type of thing started the
10235    sequence, we return.
10236
10237    Note: we have to be careful with escapes, as they can be both literal
10238    and special, and in the case of \10 and friends, context determines which.
10239
10240    A summary of the code structure is:
10241
10242    switch (first_byte) {
10243         cases for each special:
10244             handle this special;
10245             break;
10246         case '\\':
10247             switch (2nd byte) {
10248                 cases for each unambiguous special:
10249                     handle this special;
10250                     break;
10251                 cases for each ambigous special/literal:
10252                     disambiguate;
10253                     if (special)  handle here
10254                     else goto defchar;
10255                 default: // unambiguously literal:
10256                     goto defchar;
10257             }
10258         default:  // is a literal char
10259             // FALL THROUGH
10260         defchar:
10261             create EXACTish node for literal;
10262             while (more input and node isn't full) {
10263                 switch (input_byte) {
10264                    cases for each special;
10265                        make sure parse pointer is set so that the next call to
10266                            regatom will see this special first
10267                        goto loopdone; // EXACTish node terminated by prev. char
10268                    default:
10269                        append char to EXACTISH node;
10270                 }
10271                 get next input byte;
10272             }
10273         loopdone:
10274    }
10275    return the generated node;
10276
10277    Specifically there are two separate switches for handling
10278    escape sequences, with the one for handling literal escapes requiring
10279    a dummy entry for all of the special escapes that are actually handled
10280    by the other.
10281
10282    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10283    TRYAGAIN.
10284    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10285    restarted.
10286    Otherwise does not return NULL.
10287 */
10288
10289 STATIC regnode *
10290 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10291 {
10292     dVAR;
10293     regnode *ret = NULL;
10294     I32 flags = 0;
10295     char *parse_start = RExC_parse;
10296     U8 op;
10297     int invert = 0;
10298
10299     GET_RE_DEBUG_FLAGS_DECL;
10300
10301     *flagp = WORST;             /* Tentatively. */
10302
10303     DEBUG_PARSE("atom");
10304
10305     PERL_ARGS_ASSERT_REGATOM;
10306
10307 tryagain:
10308     switch ((U8)*RExC_parse) {
10309     case '^':
10310         RExC_seen_zerolen++;
10311         nextchar(pRExC_state);
10312         if (RExC_flags & RXf_PMf_MULTILINE)
10313             ret = reg_node(pRExC_state, MBOL);
10314         else if (RExC_flags & RXf_PMf_SINGLELINE)
10315             ret = reg_node(pRExC_state, SBOL);
10316         else
10317             ret = reg_node(pRExC_state, BOL);
10318         Set_Node_Length(ret, 1); /* MJD */
10319         break;
10320     case '$':
10321         nextchar(pRExC_state);
10322         if (*RExC_parse)
10323             RExC_seen_zerolen++;
10324         if (RExC_flags & RXf_PMf_MULTILINE)
10325             ret = reg_node(pRExC_state, MEOL);
10326         else if (RExC_flags & RXf_PMf_SINGLELINE)
10327             ret = reg_node(pRExC_state, SEOL);
10328         else
10329             ret = reg_node(pRExC_state, EOL);
10330         Set_Node_Length(ret, 1); /* MJD */
10331         break;
10332     case '.':
10333         nextchar(pRExC_state);
10334         if (RExC_flags & RXf_PMf_SINGLELINE)
10335             ret = reg_node(pRExC_state, SANY);
10336         else
10337             ret = reg_node(pRExC_state, REG_ANY);
10338         *flagp |= HASWIDTH|SIMPLE;
10339         RExC_naughty++;
10340         Set_Node_Length(ret, 1); /* MJD */
10341         break;
10342     case '[':
10343     {
10344         char * const oregcomp_parse = ++RExC_parse;
10345         ret = regclass(pRExC_state, flagp,depth+1,
10346                        FALSE, /* means parse the whole char class */
10347                        TRUE, /* allow multi-char folds */
10348                        FALSE, /* don't silence non-portable warnings. */
10349                        NULL);
10350         if (*RExC_parse != ']') {
10351             RExC_parse = oregcomp_parse;
10352             vFAIL("Unmatched [");
10353         }
10354         if (ret == NULL) {
10355             if (*flagp & RESTART_UTF8)
10356                 return NULL;
10357             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10358                   (UV) *flagp);
10359         }
10360         nextchar(pRExC_state);
10361         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10362         break;
10363     }
10364     case '(':
10365         nextchar(pRExC_state);
10366         ret = reg(pRExC_state, 2, &flags,depth+1);
10367         if (ret == NULL) {
10368                 if (flags & TRYAGAIN) {
10369                     if (RExC_parse == RExC_end) {
10370                          /* Make parent create an empty node if needed. */
10371                         *flagp |= TRYAGAIN;
10372                         return(NULL);
10373                     }
10374                     goto tryagain;
10375                 }
10376                 if (flags & RESTART_UTF8) {
10377                     *flagp = RESTART_UTF8;
10378                     return NULL;
10379                 }
10380                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10381         }
10382         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10383         break;
10384     case '|':
10385     case ')':
10386         if (flags & TRYAGAIN) {
10387             *flagp |= TRYAGAIN;
10388             return NULL;
10389         }
10390         vFAIL("Internal urp");
10391                                 /* Supposed to be caught earlier. */
10392         break;
10393     case '{':
10394         if (!regcurly(RExC_parse, FALSE)) {
10395             RExC_parse++;
10396             goto defchar;
10397         }
10398         /* FALL THROUGH */
10399     case '?':
10400     case '+':
10401     case '*':
10402         RExC_parse++;
10403         vFAIL("Quantifier follows nothing");
10404         break;
10405     case '\\':
10406         /* Special Escapes
10407
10408            This switch handles escape sequences that resolve to some kind
10409            of special regop and not to literal text. Escape sequnces that
10410            resolve to literal text are handled below in the switch marked
10411            "Literal Escapes".
10412
10413            Every entry in this switch *must* have a corresponding entry
10414            in the literal escape switch. However, the opposite is not
10415            required, as the default for this switch is to jump to the
10416            literal text handling code.
10417         */
10418         switch ((U8)*++RExC_parse) {
10419             U8 arg;
10420         /* Special Escapes */
10421         case 'A':
10422             RExC_seen_zerolen++;
10423             ret = reg_node(pRExC_state, SBOL);
10424             *flagp |= SIMPLE;
10425             goto finish_meta_pat;
10426         case 'G':
10427             ret = reg_node(pRExC_state, GPOS);
10428             RExC_seen |= REG_SEEN_GPOS;
10429             *flagp |= SIMPLE;
10430             goto finish_meta_pat;
10431         case 'K':
10432             RExC_seen_zerolen++;
10433             ret = reg_node(pRExC_state, KEEPS);
10434             *flagp |= SIMPLE;
10435             /* XXX:dmq : disabling in-place substitution seems to
10436              * be necessary here to avoid cases of memory corruption, as
10437              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10438              */
10439             RExC_seen |= REG_SEEN_LOOKBEHIND;
10440             goto finish_meta_pat;
10441         case 'Z':
10442             ret = reg_node(pRExC_state, SEOL);
10443             *flagp |= SIMPLE;
10444             RExC_seen_zerolen++;                /* Do not optimize RE away */
10445             goto finish_meta_pat;
10446         case 'z':
10447             ret = reg_node(pRExC_state, EOS);
10448             *flagp |= SIMPLE;
10449             RExC_seen_zerolen++;                /* Do not optimize RE away */
10450             goto finish_meta_pat;
10451         case 'C':
10452             ret = reg_node(pRExC_state, CANY);
10453             RExC_seen |= REG_SEEN_CANY;
10454             *flagp |= HASWIDTH|SIMPLE;
10455             goto finish_meta_pat;
10456         case 'X':
10457             ret = reg_node(pRExC_state, CLUMP);
10458             *flagp |= HASWIDTH;
10459             goto finish_meta_pat;
10460
10461         case 'W':
10462             invert = 1;
10463             /* FALLTHROUGH */
10464         case 'w':
10465             arg = ANYOF_WORDCHAR;
10466             goto join_posix;
10467
10468         case 'b':
10469             RExC_seen_zerolen++;
10470             RExC_seen |= REG_SEEN_LOOKBEHIND;
10471             op = BOUND + get_regex_charset(RExC_flags);
10472             if (op > BOUNDA) {  /* /aa is same as /a */
10473                 op = BOUNDA;
10474             }
10475             ret = reg_node(pRExC_state, op);
10476             FLAGS(ret) = get_regex_charset(RExC_flags);
10477             *flagp |= SIMPLE;
10478             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10479                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10480             }
10481             goto finish_meta_pat;
10482         case 'B':
10483             RExC_seen_zerolen++;
10484             RExC_seen |= REG_SEEN_LOOKBEHIND;
10485             op = NBOUND + get_regex_charset(RExC_flags);
10486             if (op > NBOUNDA) { /* /aa is same as /a */
10487                 op = NBOUNDA;
10488             }
10489             ret = reg_node(pRExC_state, op);
10490             FLAGS(ret) = get_regex_charset(RExC_flags);
10491             *flagp |= SIMPLE;
10492             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10493                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10494             }
10495             goto finish_meta_pat;
10496
10497         case 'D':
10498             invert = 1;
10499             /* FALLTHROUGH */
10500         case 'd':
10501             arg = ANYOF_DIGIT;
10502             goto join_posix;
10503
10504         case 'R':
10505             ret = reg_node(pRExC_state, LNBREAK);
10506             *flagp |= HASWIDTH|SIMPLE;
10507             goto finish_meta_pat;
10508
10509         case 'H':
10510             invert = 1;
10511             /* FALLTHROUGH */
10512         case 'h':
10513             arg = ANYOF_BLANK;
10514             op = POSIXU;
10515             goto join_posix_op_known;
10516
10517         case 'V':
10518             invert = 1;
10519             /* FALLTHROUGH */
10520         case 'v':
10521             arg = ANYOF_VERTWS;
10522             op = POSIXU;
10523             goto join_posix_op_known;
10524
10525         case 'S':
10526             invert = 1;
10527             /* FALLTHROUGH */
10528         case 's':
10529             arg = ANYOF_SPACE;
10530
10531         join_posix:
10532
10533             op = POSIXD + get_regex_charset(RExC_flags);
10534             if (op > POSIXA) {  /* /aa is same as /a */
10535                 op = POSIXA;
10536             }
10537
10538         join_posix_op_known:
10539
10540             if (invert) {
10541                 op += NPOSIXD - POSIXD;
10542             }
10543
10544             ret = reg_node(pRExC_state, op);
10545             if (! SIZE_ONLY) {
10546                 FLAGS(ret) = namedclass_to_classnum(arg);
10547             }
10548
10549             *flagp |= HASWIDTH|SIMPLE;
10550             /* FALL THROUGH */
10551
10552          finish_meta_pat:
10553             nextchar(pRExC_state);
10554             Set_Node_Length(ret, 2); /* MJD */
10555             break;
10556         case 'p':
10557         case 'P':
10558             {
10559 #ifdef DEBUGGING
10560                 char* parse_start = RExC_parse - 2;
10561 #endif
10562
10563                 RExC_parse--;
10564
10565                 ret = regclass(pRExC_state, flagp,depth+1,
10566                                TRUE, /* means just parse this element */
10567                                FALSE, /* don't allow multi-char folds */
10568                                FALSE, /* don't silence non-portable warnings.
10569                                          It would be a bug if these returned
10570                                          non-portables */
10571                                NULL);
10572                 /* regclass() can only return RESTART_UTF8 if multi-char folds
10573                    are allowed.  */
10574                 if (!ret)
10575                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10576                           (UV) *flagp);
10577
10578                 RExC_parse--;
10579
10580                 Set_Node_Offset(ret, parse_start + 2);
10581                 Set_Node_Cur_Length(ret, parse_start);
10582                 nextchar(pRExC_state);
10583             }
10584             break;
10585         case 'N':
10586             /* Handle \N and \N{NAME} with multiple code points here and not
10587              * below because it can be multicharacter. join_exact() will join
10588              * them up later on.  Also this makes sure that things like
10589              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10590              * The options to the grok function call causes it to fail if the
10591              * sequence is just a single code point.  We then go treat it as
10592              * just another character in the current EXACT node, and hence it
10593              * gets uniform treatment with all the other characters.  The
10594              * special treatment for quantifiers is not needed for such single
10595              * character sequences */
10596             ++RExC_parse;
10597             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10598                                 FALSE /* not strict */ )) {
10599                 if (*flagp & RESTART_UTF8)
10600                     return NULL;
10601                 RExC_parse--;
10602                 goto defchar;
10603             }
10604             break;
10605         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10606         parse_named_seq:
10607         {
10608             char ch= RExC_parse[1];
10609             if (ch != '<' && ch != '\'' && ch != '{') {
10610                 RExC_parse++;
10611                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10612             } else {
10613                 /* this pretty much dupes the code for (?P=...) in reg(), if
10614                    you change this make sure you change that */
10615                 char* name_start = (RExC_parse += 2);
10616                 U32 num = 0;
10617                 SV *sv_dat = reg_scan_name(pRExC_state,
10618                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10619                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10620                 if (RExC_parse == name_start || *RExC_parse != ch)
10621                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10622
10623                 if (!SIZE_ONLY) {
10624                     num = add_data( pRExC_state, 1, "S" );
10625                     RExC_rxi->data->data[num]=(void*)sv_dat;
10626                     SvREFCNT_inc_simple_void(sv_dat);
10627                 }
10628
10629                 RExC_sawback = 1;
10630                 ret = reganode(pRExC_state,
10631                                ((! FOLD)
10632                                  ? NREF
10633                                  : (ASCII_FOLD_RESTRICTED)
10634                                    ? NREFFA
10635                                    : (AT_LEAST_UNI_SEMANTICS)
10636                                      ? NREFFU
10637                                      : (LOC)
10638                                        ? NREFFL
10639                                        : NREFF),
10640                                 num);
10641                 *flagp |= HASWIDTH;
10642
10643                 /* override incorrect value set in reganode MJD */
10644                 Set_Node_Offset(ret, parse_start+1);
10645                 Set_Node_Cur_Length(ret, parse_start);
10646                 nextchar(pRExC_state);
10647
10648             }
10649             break;
10650         }
10651         case 'g':
10652         case '1': case '2': case '3': case '4':
10653         case '5': case '6': case '7': case '8': case '9':
10654             {
10655                 I32 num;
10656                 bool isg = *RExC_parse == 'g';
10657                 bool isrel = 0;
10658                 bool hasbrace = 0;
10659                 if (isg) {
10660                     RExC_parse++;
10661                     if (*RExC_parse == '{') {
10662                         RExC_parse++;
10663                         hasbrace = 1;
10664                     }
10665                     if (*RExC_parse == '-') {
10666                         RExC_parse++;
10667                         isrel = 1;
10668                     }
10669                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10670                         if (isrel) RExC_parse--;
10671                         RExC_parse -= 2;
10672                         goto parse_named_seq;
10673                 }   }
10674                 num = atoi(RExC_parse);
10675                 if (isg && num == 0) {
10676                     if (*RExC_parse == '0') {
10677                         vFAIL("Reference to invalid group 0");
10678                     }
10679                     else {
10680                         vFAIL("Unterminated \\g... pattern");
10681                     }
10682                 }
10683                 if (isrel) {
10684                     num = RExC_npar - num;
10685                     if (num < 1)
10686                         vFAIL("Reference to nonexistent or unclosed group");
10687                 }
10688                 if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10689                     /* Probably a character specified in octal, e.g. \35 */
10690                     goto defchar;
10691                 else {
10692 #ifdef RE_TRACK_PATTERN_OFFSETS
10693                     char * const parse_start = RExC_parse - 1; /* MJD */
10694 #endif
10695                     while (isDIGIT(*RExC_parse))
10696                         RExC_parse++;
10697                     if (hasbrace) {
10698                         if (*RExC_parse != '}')
10699                             vFAIL("Unterminated \\g{...} pattern");
10700                         RExC_parse++;
10701                     }
10702                     if (!SIZE_ONLY) {
10703                         if (num > (I32)RExC_rx->nparens)
10704                             vFAIL("Reference to nonexistent group");
10705                     }
10706                     RExC_sawback = 1;
10707                     ret = reganode(pRExC_state,
10708                                    ((! FOLD)
10709                                      ? REF
10710                                      : (ASCII_FOLD_RESTRICTED)
10711                                        ? REFFA
10712                                        : (AT_LEAST_UNI_SEMANTICS)
10713                                          ? REFFU
10714                                          : (LOC)
10715                                            ? REFFL
10716                                            : REFF),
10717                                     num);
10718                     *flagp |= HASWIDTH;
10719
10720                     /* override incorrect value set in reganode MJD */
10721                     Set_Node_Offset(ret, parse_start+1);
10722                     Set_Node_Cur_Length(ret, parse_start);
10723                     RExC_parse--;
10724                     nextchar(pRExC_state);
10725                 }
10726             }
10727             break;
10728         case '\0':
10729             if (RExC_parse >= RExC_end)
10730                 FAIL("Trailing \\");
10731             /* FALL THROUGH */
10732         default:
10733             /* Do not generate "unrecognized" warnings here, we fall
10734                back into the quick-grab loop below */
10735             parse_start--;
10736             goto defchar;
10737         }
10738         break;
10739
10740     case '#':
10741         if (RExC_flags & RXf_PMf_EXTENDED) {
10742             if ( reg_skipcomment( pRExC_state ) )
10743                 goto tryagain;
10744         }
10745         /* FALL THROUGH */
10746
10747     default:
10748
10749             parse_start = RExC_parse - 1;
10750
10751             RExC_parse++;
10752
10753         defchar: {
10754             STRLEN len = 0;
10755             UV ender = 0;
10756             char *p;
10757             char *s;
10758 #define MAX_NODE_STRING_SIZE 127
10759             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10760             char *s0;
10761             U8 upper_parse = MAX_NODE_STRING_SIZE;
10762             STRLEN foldlen;
10763             U8 node_type = compute_EXACTish(pRExC_state);
10764             bool next_is_quantifier;
10765             char * oldp = NULL;
10766
10767             /* We can convert EXACTF nodes to EXACTFU if they contain only
10768              * characters that match identically regardless of the target
10769              * string's UTF8ness.  The reason to do this is that EXACTF is not
10770              * trie-able, EXACTFU is.  (We don't need to figure this out until
10771              * pass 2) */
10772             bool maybe_exactfu = node_type == EXACTF && PASS2;
10773
10774             /* If a folding node contains only code points that don't
10775              * participate in folds, it can be changed into an EXACT node,
10776              * which allows the optimizer more things to look for */
10777             bool maybe_exact;
10778
10779             ret = reg_node(pRExC_state, node_type);
10780
10781             /* In pass1, folded, we use a temporary buffer instead of the
10782              * actual node, as the node doesn't exist yet */
10783             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10784
10785             s0 = s;
10786
10787         reparse:
10788
10789             /* We do the EXACTFish to EXACT node only if folding, and not if in
10790              * locale, as whether a character folds or not isn't known until
10791              * runtime.  (And we don't need to figure this out until pass 2) */
10792             maybe_exact = FOLD && ! LOC && PASS2;
10793
10794             /* XXX The node can hold up to 255 bytes, yet this only goes to
10795              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10796              * 255 allows us to not have to worry about overflow due to
10797              * converting to utf8 and fold expansion, but that value is
10798              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10799              * split up by this limit into a single one using the real max of
10800              * 255.  Even at 127, this breaks under rare circumstances.  If
10801              * folding, we do not want to split a node at a character that is a
10802              * non-final in a multi-char fold, as an input string could just
10803              * happen to want to match across the node boundary.  The join
10804              * would solve that problem if the join actually happens.  But a
10805              * series of more than two nodes in a row each of 127 would cause
10806              * the first join to succeed to get to 254, but then there wouldn't
10807              * be room for the next one, which could at be one of those split
10808              * multi-char folds.  I don't know of any fool-proof solution.  One
10809              * could back off to end with only a code point that isn't such a
10810              * non-final, but it is possible for there not to be any in the
10811              * entire node. */
10812             for (p = RExC_parse - 1;
10813                  len < upper_parse && p < RExC_end;
10814                  len++)
10815             {
10816                 oldp = p;
10817
10818                 if (RExC_flags & RXf_PMf_EXTENDED)
10819                     p = regwhite( pRExC_state, p );
10820                 switch ((U8)*p) {
10821                 case '^':
10822                 case '$':
10823                 case '.':
10824                 case '[':
10825                 case '(':
10826                 case ')':
10827                 case '|':
10828                     goto loopdone;
10829                 case '\\':
10830                     /* Literal Escapes Switch
10831
10832                        This switch is meant to handle escape sequences that
10833                        resolve to a literal character.
10834
10835                        Every escape sequence that represents something
10836                        else, like an assertion or a char class, is handled
10837                        in the switch marked 'Special Escapes' above in this
10838                        routine, but also has an entry here as anything that
10839                        isn't explicitly mentioned here will be treated as
10840                        an unescaped equivalent literal.
10841                     */
10842
10843                     switch ((U8)*++p) {
10844                     /* These are all the special escapes. */
10845                     case 'A':             /* Start assertion */
10846                     case 'b': case 'B':   /* Word-boundary assertion*/
10847                     case 'C':             /* Single char !DANGEROUS! */
10848                     case 'd': case 'D':   /* digit class */
10849                     case 'g': case 'G':   /* generic-backref, pos assertion */
10850                     case 'h': case 'H':   /* HORIZWS */
10851                     case 'k': case 'K':   /* named backref, keep marker */
10852                     case 'p': case 'P':   /* Unicode property */
10853                               case 'R':   /* LNBREAK */
10854                     case 's': case 'S':   /* space class */
10855                     case 'v': case 'V':   /* VERTWS */
10856                     case 'w': case 'W':   /* word class */
10857                     case 'X':             /* eXtended Unicode "combining character sequence" */
10858                     case 'z': case 'Z':   /* End of line/string assertion */
10859                         --p;
10860                         goto loopdone;
10861
10862                     /* Anything after here is an escape that resolves to a
10863                        literal. (Except digits, which may or may not)
10864                      */
10865                     case 'n':
10866                         ender = '\n';
10867                         p++;
10868                         break;
10869                     case 'N': /* Handle a single-code point named character. */
10870                         /* The options cause it to fail if a multiple code
10871                          * point sequence.  Handle those in the switch() above
10872                          * */
10873                         RExC_parse = p + 1;
10874                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10875                                             flagp, depth, FALSE,
10876                                             FALSE /* not strict */ ))
10877                         {
10878                             if (*flagp & RESTART_UTF8)
10879                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10880                             RExC_parse = p = oldp;
10881                             goto loopdone;
10882                         }
10883                         p = RExC_parse;
10884                         if (ender > 0xff) {
10885                             REQUIRE_UTF8;
10886                         }
10887                         break;
10888                     case 'r':
10889                         ender = '\r';
10890                         p++;
10891                         break;
10892                     case 't':
10893                         ender = '\t';
10894                         p++;
10895                         break;
10896                     case 'f':
10897                         ender = '\f';
10898                         p++;
10899                         break;
10900                     case 'e':
10901                           ender = ASCII_TO_NATIVE('\033');
10902                         p++;
10903                         break;
10904                     case 'a':
10905                           ender = ASCII_TO_NATIVE('\007');
10906                         p++;
10907                         break;
10908                     case 'o':
10909                         {
10910                             UV result;
10911                             const char* error_msg;
10912
10913                             bool valid = grok_bslash_o(&p,
10914                                                        &result,
10915                                                        &error_msg,
10916                                                        TRUE, /* out warnings */
10917                                                        FALSE, /* not strict */
10918                                                        TRUE, /* Output warnings
10919                                                                 for non-
10920                                                                 portables */
10921                                                        UTF);
10922                             if (! valid) {
10923                                 RExC_parse = p; /* going to die anyway; point
10924                                                    to exact spot of failure */
10925                                 vFAIL(error_msg);
10926                             }
10927                             ender = result;
10928                             if (PL_encoding && ender < 0x100) {
10929                                 goto recode_encoding;
10930                             }
10931                             if (ender > 0xff) {
10932                                 REQUIRE_UTF8;
10933                             }
10934                             break;
10935                         }
10936                     case 'x':
10937                         {
10938                             UV result = UV_MAX; /* initialize to erroneous
10939                                                    value */
10940                             const char* error_msg;
10941
10942                             bool valid = grok_bslash_x(&p,
10943                                                        &result,
10944                                                        &error_msg,
10945                                                        TRUE, /* out warnings */
10946                                                        FALSE, /* not strict */
10947                                                        TRUE, /* Output warnings
10948                                                                 for non-
10949                                                                 portables */
10950                                                        UTF);
10951                             if (! valid) {
10952                                 RExC_parse = p; /* going to die anyway; point
10953                                                    to exact spot of failure */
10954                                 vFAIL(error_msg);
10955                             }
10956                             ender = result;
10957
10958                             if (PL_encoding && ender < 0x100) {
10959                                 goto recode_encoding;
10960                             }
10961                             if (ender > 0xff) {
10962                                 REQUIRE_UTF8;
10963                             }
10964                             break;
10965                         }
10966                     case 'c':
10967                         p++;
10968                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10969                         break;
10970                     case '8': case '9': /* must be a backreference */
10971                         --p;
10972                         goto loopdone;
10973                     case '1': case '2': case '3':case '4':
10974                     case '5': case '6': case '7':
10975                         /* When we parse backslash escapes there is ambiguity between
10976                          * backreferences and octal escapes. Any escape from \1 - \9 is
10977                          * a backreference, any multi-digit escape which does not start with
10978                          * 0 and which when evaluated as decimal could refer to an already
10979                          * parsed capture buffer is a backslash. Anything else is octal.
10980                          *
10981                          * Note this implies that \118 could be interpreted as 118 OR as
10982                          * "\11" . "8" depending on whether there were 118 capture buffers
10983                          * defined already in the pattern.
10984                          */
10985                         if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
10986                         {  /* Not to be treated as an octal constant, go
10987                                    find backref */
10988                             --p;
10989                             goto loopdone;
10990                         }
10991                     case '0':
10992                         {
10993                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10994                             STRLEN numlen = 3;
10995                             ender = grok_oct(p, &numlen, &flags, NULL);
10996                             if (ender > 0xff) {
10997                                 REQUIRE_UTF8;
10998                             }
10999                             p += numlen;
11000                             if (SIZE_ONLY   /* like \08, \178 */
11001                                 && numlen < 3
11002                                 && p < RExC_end
11003                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11004                             {
11005                                 reg_warn_non_literal_string(
11006                                          p + 1,
11007                                          form_short_octal_warning(p, numlen));
11008                             }
11009                         }
11010                         if (PL_encoding && ender < 0x100)
11011                             goto recode_encoding;
11012                         break;
11013                     recode_encoding:
11014                         if (! RExC_override_recoding) {
11015                             SV* enc = PL_encoding;
11016                             ender = reg_recode((const char)(U8)ender, &enc);
11017                             if (!enc && SIZE_ONLY)
11018                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11019                             REQUIRE_UTF8;
11020                         }
11021                         break;
11022                     case '\0':
11023                         if (p >= RExC_end)
11024                             FAIL("Trailing \\");
11025                         /* FALL THROUGH */
11026                     default:
11027                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11028                             /* Include any { following the alpha to emphasize
11029                              * that it could be part of an escape at some point
11030                              * in the future */
11031                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11032                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11033                         }
11034                         goto normal_default;
11035                     } /* End of switch on '\' */
11036                     break;
11037                 default:    /* A literal character */
11038
11039                     if (! SIZE_ONLY
11040                         && RExC_flags & RXf_PMf_EXTENDED
11041                         && ckWARN_d(WARN_DEPRECATED)
11042                         && is_PATWS_non_low(p, UTF))
11043                     {
11044                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11045                                 "Escape literal pattern white space under /x");
11046                     }
11047
11048                   normal_default:
11049                     if (UTF8_IS_START(*p) && UTF) {
11050                         STRLEN numlen;
11051                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11052                                                &numlen, UTF8_ALLOW_DEFAULT);
11053                         p += numlen;
11054                     }
11055                     else
11056                         ender = (U8) *p++;
11057                     break;
11058                 } /* End of switch on the literal */
11059
11060                 /* Here, have looked at the literal character and <ender>
11061                  * contains its ordinal, <p> points to the character after it
11062                  */
11063
11064                 if ( RExC_flags & RXf_PMf_EXTENDED)
11065                     p = regwhite( pRExC_state, p );
11066
11067                 /* If the next thing is a quantifier, it applies to this
11068                  * character only, which means that this character has to be in
11069                  * its own node and can't just be appended to the string in an
11070                  * existing node, so if there are already other characters in
11071                  * the node, close the node with just them, and set up to do
11072                  * this character again next time through, when it will be the
11073                  * only thing in its new node */
11074                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11075                 {
11076                     p = oldp;
11077                     goto loopdone;
11078                 }
11079
11080                 if (! FOLD) {
11081                     if (UTF) {
11082                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11083                         if (unilen > 0) {
11084                            s   += unilen;
11085                            len += unilen;
11086                         }
11087
11088                         /* The loop increments <len> each time, as all but this
11089                          * path (and one other) through it add a single byte to
11090                          * the EXACTish node.  But this one has changed len to
11091                          * be the correct final value, so subtract one to
11092                          * cancel out the increment that follows */
11093                         len--;
11094                     }
11095                     else {
11096                         REGC((char)ender, s++);
11097                     }
11098                 }
11099                 else /* FOLD */
11100                      if (! ( UTF
11101                         /* See comments for join_exact() as to why we fold this
11102                          * non-UTF at compile time */
11103                         || (node_type == EXACTFU
11104                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11105                 {
11106                     if (IS_IN_SOME_FOLD_L1(ender)) {
11107                         maybe_exact = FALSE;
11108
11109                         /* See if the character's fold differs between /d and
11110                          * /u.  This includes the multi-char fold SHARP S to
11111                          * 'ss' */
11112                         if (maybe_exactfu
11113                             && (PL_fold[ender] != PL_fold_latin1[ender]
11114                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11115                                 || (len > 0
11116                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11117                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11118                         {
11119                             maybe_exactfu = FALSE;
11120                         }
11121                     }
11122                     *(s++) = (char) ender;
11123                 }
11124                 else {  /* UTF */
11125
11126                     /* Prime the casefolded buffer.  Locale rules, which apply
11127                      * only to code points < 256, aren't known until execution,
11128                      * so for them, just output the original character using
11129                      * utf8.  If we start to fold non-UTF patterns, be sure to
11130                      * update join_exact() */
11131                     if (LOC && ender < 256) {
11132                         if (UNI_IS_INVARIANT(ender)) {
11133                             *s = (U8) ender;
11134                             foldlen = 1;
11135                         } else {
11136                             *s = UTF8_TWO_BYTE_HI(ender);
11137                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11138                             foldlen = 2;
11139                         }
11140                     }
11141                     else {
11142                         UV folded = _to_uni_fold_flags(
11143                                        ender,
11144                                        (U8 *) s,
11145                                        &foldlen,
11146                                        FOLD_FLAGS_FULL
11147                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11148                                                 : (ASCII_FOLD_RESTRICTED)
11149                                                   ? FOLD_FLAGS_NOMIX_ASCII
11150                                                   : 0)
11151                                         );
11152
11153                         /* If this node only contains non-folding code points
11154                          * so far, see if this new one is also non-folding */
11155                         if (maybe_exact) {
11156                             if (folded != ender) {
11157                                 maybe_exact = FALSE;
11158                             }
11159                             else {
11160                                 /* Here the fold is the original; we have
11161                                  * to check further to see if anything
11162                                  * folds to it */
11163                                 if (! PL_utf8_foldable) {
11164                                     SV* swash = swash_init("utf8",
11165                                                        "_Perl_Any_Folds",
11166                                                        &PL_sv_undef, 1, 0);
11167                                     PL_utf8_foldable =
11168                                                 _get_swash_invlist(swash);
11169                                     SvREFCNT_dec_NN(swash);
11170                                 }
11171                                 if (_invlist_contains_cp(PL_utf8_foldable,
11172                                                          ender))
11173                                 {
11174                                     maybe_exact = FALSE;
11175                                 }
11176                             }
11177                         }
11178                         ender = folded;
11179                     }
11180                     s += foldlen;
11181
11182                     /* The loop increments <len> each time, as all but this
11183                      * path (and one other) through it add a single byte to the
11184                      * EXACTish node.  But this one has changed len to be the
11185                      * correct final value, so subtract one to cancel out the
11186                      * increment that follows */
11187                     len += foldlen - 1;
11188                 }
11189
11190                 if (next_is_quantifier) {
11191
11192                     /* Here, the next input is a quantifier, and to get here,
11193                      * the current character is the only one in the node.
11194                      * Also, here <len> doesn't include the final byte for this
11195                      * character */
11196                     len++;
11197                     goto loopdone;
11198                 }
11199
11200             } /* End of loop through literal characters */
11201
11202             /* Here we have either exhausted the input or ran out of room in
11203              * the node.  (If we encountered a character that can't be in the
11204              * node, transfer is made directly to <loopdone>, and so we
11205              * wouldn't have fallen off the end of the loop.)  In the latter
11206              * case, we artificially have to split the node into two, because
11207              * we just don't have enough space to hold everything.  This
11208              * creates a problem if the final character participates in a
11209              * multi-character fold in the non-final position, as a match that
11210              * should have occurred won't, due to the way nodes are matched,
11211              * and our artificial boundary.  So back off until we find a non-
11212              * problematic character -- one that isn't at the beginning or
11213              * middle of such a fold.  (Either it doesn't participate in any
11214              * folds, or appears only in the final position of all the folds it
11215              * does participate in.)  A better solution with far fewer false
11216              * positives, and that would fill the nodes more completely, would
11217              * be to actually have available all the multi-character folds to
11218              * test against, and to back-off only far enough to be sure that
11219              * this node isn't ending with a partial one.  <upper_parse> is set
11220              * further below (if we need to reparse the node) to include just
11221              * up through that final non-problematic character that this code
11222              * identifies, so when it is set to less than the full node, we can
11223              * skip the rest of this */
11224             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11225
11226                 const STRLEN full_len = len;
11227
11228                 assert(len >= MAX_NODE_STRING_SIZE);
11229
11230                 /* Here, <s> points to the final byte of the final character.
11231                  * Look backwards through the string until find a non-
11232                  * problematic character */
11233
11234                 if (! UTF) {
11235
11236                     /* These two have no multi-char folds to non-UTF characters
11237                      */
11238                     if (ASCII_FOLD_RESTRICTED || LOC) {
11239                         goto loopdone;
11240                     }
11241
11242                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11243                     len = s - s0 + 1;
11244                 }
11245                 else {
11246                     if (!  PL_NonL1NonFinalFold) {
11247                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11248                                         NonL1_Perl_Non_Final_Folds_invlist);
11249                     }
11250
11251                     /* Point to the first byte of the final character */
11252                     s = (char *) utf8_hop((U8 *) s, -1);
11253
11254                     while (s >= s0) {   /* Search backwards until find
11255                                            non-problematic char */
11256                         if (UTF8_IS_INVARIANT(*s)) {
11257
11258                             /* There are no ascii characters that participate
11259                              * in multi-char folds under /aa.  In EBCDIC, the
11260                              * non-ascii invariants are all control characters,
11261                              * so don't ever participate in any folds. */
11262                             if (ASCII_FOLD_RESTRICTED
11263                                 || ! IS_NON_FINAL_FOLD(*s))
11264                             {
11265                                 break;
11266                             }
11267                         }
11268                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11269
11270                             /* No Latin1 characters participate in multi-char
11271                              * folds under /l */
11272                             if (LOC
11273                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11274                                                                 *s, *(s+1))))
11275                             {
11276                                 break;
11277                             }
11278                         }
11279                         else if (! _invlist_contains_cp(
11280                                         PL_NonL1NonFinalFold,
11281                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11282                         {
11283                             break;
11284                         }
11285
11286                         /* Here, the current character is problematic in that
11287                          * it does occur in the non-final position of some
11288                          * fold, so try the character before it, but have to
11289                          * special case the very first byte in the string, so
11290                          * we don't read outside the string */
11291                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11292                     } /* End of loop backwards through the string */
11293
11294                     /* If there were only problematic characters in the string,
11295                      * <s> will point to before s0, in which case the length
11296                      * should be 0, otherwise include the length of the
11297                      * non-problematic character just found */
11298                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11299                 }
11300
11301                 /* Here, have found the final character, if any, that is
11302                  * non-problematic as far as ending the node without splitting
11303                  * it across a potential multi-char fold.  <len> contains the
11304                  * number of bytes in the node up-to and including that
11305                  * character, or is 0 if there is no such character, meaning
11306                  * the whole node contains only problematic characters.  In
11307                  * this case, give up and just take the node as-is.  We can't
11308                  * do any better */
11309                 if (len == 0) {
11310                     len = full_len;
11311
11312                     /* If the node ends in an 's' we make sure it stays EXACTF,
11313                      * as if it turns into an EXACTFU, it could later get
11314                      * joined with another 's' that would then wrongly match
11315                      * the sharp s */
11316                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11317                     {
11318                         maybe_exactfu = FALSE;
11319                     }
11320                 } else {
11321
11322                     /* Here, the node does contain some characters that aren't
11323                      * problematic.  If one such is the final character in the
11324                      * node, we are done */
11325                     if (len == full_len) {
11326                         goto loopdone;
11327                     }
11328                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11329
11330                         /* If the final character is problematic, but the
11331                          * penultimate is not, back-off that last character to
11332                          * later start a new node with it */
11333                         p = oldp;
11334                         goto loopdone;
11335                     }
11336
11337                     /* Here, the final non-problematic character is earlier
11338                      * in the input than the penultimate character.  What we do
11339                      * is reparse from the beginning, going up only as far as
11340                      * this final ok one, thus guaranteeing that the node ends
11341                      * in an acceptable character.  The reason we reparse is
11342                      * that we know how far in the character is, but we don't
11343                      * know how to correlate its position with the input parse.
11344                      * An alternate implementation would be to build that
11345                      * correlation as we go along during the original parse,
11346                      * but that would entail extra work for every node, whereas
11347                      * this code gets executed only when the string is too
11348                      * large for the node, and the final two characters are
11349                      * problematic, an infrequent occurrence.  Yet another
11350                      * possible strategy would be to save the tail of the
11351                      * string, and the next time regatom is called, initialize
11352                      * with that.  The problem with this is that unless you
11353                      * back off one more character, you won't be guaranteed
11354                      * regatom will get called again, unless regbranch,
11355                      * regpiece ... are also changed.  If you do back off that
11356                      * extra character, so that there is input guaranteed to
11357                      * force calling regatom, you can't handle the case where
11358                      * just the first character in the node is acceptable.  I
11359                      * (khw) decided to try this method which doesn't have that
11360                      * pitfall; if performance issues are found, we can do a
11361                      * combination of the current approach plus that one */
11362                     upper_parse = len;
11363                     len = 0;
11364                     s = s0;
11365                     goto reparse;
11366                 }
11367             }   /* End of verifying node ends with an appropriate char */
11368
11369         loopdone:   /* Jumped to when encounters something that shouldn't be in
11370                        the node */
11371
11372             /* I (khw) don't know if you can get here with zero length, but the
11373              * old code handled this situation by creating a zero-length EXACT
11374              * node.  Might as well be NOTHING instead */
11375             if (len == 0) {
11376                 OP(ret) = NOTHING;
11377             }
11378             else {
11379                 if (FOLD) {
11380                     /* If 'maybe_exact' is still set here, means there are no
11381                      * code points in the node that participate in folds;
11382                      * similarly for 'maybe_exactfu' and code points that match
11383                      * differently depending on UTF8ness of the target string
11384                      * */
11385                     if (maybe_exact) {
11386                         OP(ret) = EXACT;
11387                     }
11388                     else if (maybe_exactfu) {
11389                         OP(ret) = EXACTFU;
11390                     }
11391                 }
11392                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11393             }
11394
11395             RExC_parse = p - 1;
11396             Set_Node_Cur_Length(ret, parse_start);
11397             nextchar(pRExC_state);
11398             {
11399                 /* len is STRLEN which is unsigned, need to copy to signed */
11400                 IV iv = len;
11401                 if (iv < 0)
11402                     vFAIL("Internal disaster");
11403             }
11404
11405         } /* End of label 'defchar:' */
11406         break;
11407     } /* End of giant switch on input character */
11408
11409     return(ret);
11410 }
11411
11412 STATIC char *
11413 S_regwhite( RExC_state_t *pRExC_state, char *p )
11414 {
11415     const char *e = RExC_end;
11416
11417     PERL_ARGS_ASSERT_REGWHITE;
11418
11419     while (p < e) {
11420         if (isSPACE(*p))
11421             ++p;
11422         else if (*p == '#') {
11423             bool ended = 0;
11424             do {
11425                 if (*p++ == '\n') {
11426                     ended = 1;
11427                     break;
11428                 }
11429             } while (p < e);
11430             if (!ended)
11431                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11432         }
11433         else
11434             break;
11435     }
11436     return p;
11437 }
11438
11439 STATIC char *
11440 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11441 {
11442     /* Returns the next non-pattern-white space, non-comment character (the
11443      * latter only if 'recognize_comment is true) in the string p, which is
11444      * ended by RExC_end.  If there is no line break ending a comment,
11445      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11446     const char *e = RExC_end;
11447
11448     PERL_ARGS_ASSERT_REGPATWS;
11449
11450     while (p < e) {
11451         STRLEN len;
11452         if ((len = is_PATWS_safe(p, e, UTF))) {
11453             p += len;
11454         }
11455         else if (recognize_comment && *p == '#') {
11456             bool ended = 0;
11457             do {
11458                 p++;
11459                 if (is_LNBREAK_safe(p, e, UTF)) {
11460                     ended = 1;
11461                     break;
11462                 }
11463             } while (p < e);
11464             if (!ended)
11465                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11466         }
11467         else
11468             break;
11469     }
11470     return p;
11471 }
11472
11473 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11474    Character classes ([:foo:]) can also be negated ([:^foo:]).
11475    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11476    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11477    but trigger failures because they are currently unimplemented. */
11478
11479 #define POSIXCC_DONE(c)   ((c) == ':')
11480 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11481 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11482
11483 PERL_STATIC_INLINE I32
11484 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11485 {
11486     dVAR;
11487     I32 namedclass = OOB_NAMEDCLASS;
11488
11489     PERL_ARGS_ASSERT_REGPPOSIXCC;
11490
11491     if (value == '[' && RExC_parse + 1 < RExC_end &&
11492         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11493         POSIXCC(UCHARAT(RExC_parse)))
11494     {
11495         const char c = UCHARAT(RExC_parse);
11496         char* const s = RExC_parse++;
11497
11498         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11499             RExC_parse++;
11500         if (RExC_parse == RExC_end) {
11501             if (strict) {
11502
11503                 /* Try to give a better location for the error (than the end of
11504                  * the string) by looking for the matching ']' */
11505                 RExC_parse = s;
11506                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11507                     RExC_parse++;
11508                 }
11509                 vFAIL2("Unmatched '%c' in POSIX class", c);
11510             }
11511             /* Grandfather lone [:, [=, [. */
11512             RExC_parse = s;
11513         }
11514         else {
11515             const char* const t = RExC_parse++; /* skip over the c */
11516             assert(*t == c);
11517
11518             if (UCHARAT(RExC_parse) == ']') {
11519                 const char *posixcc = s + 1;
11520                 RExC_parse++; /* skip over the ending ] */
11521
11522                 if (*s == ':') {
11523                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11524                     const I32 skip = t - posixcc;
11525
11526                     /* Initially switch on the length of the name.  */
11527                     switch (skip) {
11528                     case 4:
11529                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11530                                                           this is the Perl \w
11531                                                         */
11532                             namedclass = ANYOF_WORDCHAR;
11533                         break;
11534                     case 5:
11535                         /* Names all of length 5.  */
11536                         /* alnum alpha ascii blank cntrl digit graph lower
11537                            print punct space upper  */
11538                         /* Offset 4 gives the best switch position.  */
11539                         switch (posixcc[4]) {
11540                         case 'a':
11541                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11542                                 namedclass = ANYOF_ALPHA;
11543                             break;
11544                         case 'e':
11545                             if (memEQ(posixcc, "spac", 4)) /* space */
11546                                 namedclass = ANYOF_PSXSPC;
11547                             break;
11548                         case 'h':
11549                             if (memEQ(posixcc, "grap", 4)) /* graph */
11550                                 namedclass = ANYOF_GRAPH;
11551                             break;
11552                         case 'i':
11553                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11554                                 namedclass = ANYOF_ASCII;
11555                             break;
11556                         case 'k':
11557                             if (memEQ(posixcc, "blan", 4)) /* blank */
11558                                 namedclass = ANYOF_BLANK;
11559                             break;
11560                         case 'l':
11561                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11562                                 namedclass = ANYOF_CNTRL;
11563                             break;
11564                         case 'm':
11565                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11566                                 namedclass = ANYOF_ALPHANUMERIC;
11567                             break;
11568                         case 'r':
11569                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11570                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11571                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11572                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11573                             break;
11574                         case 't':
11575                             if (memEQ(posixcc, "digi", 4)) /* digit */
11576                                 namedclass = ANYOF_DIGIT;
11577                             else if (memEQ(posixcc, "prin", 4)) /* print */
11578                                 namedclass = ANYOF_PRINT;
11579                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11580                                 namedclass = ANYOF_PUNCT;
11581                             break;
11582                         }
11583                         break;
11584                     case 6:
11585                         if (memEQ(posixcc, "xdigit", 6))
11586                             namedclass = ANYOF_XDIGIT;
11587                         break;
11588                     }
11589
11590                     if (namedclass == OOB_NAMEDCLASS)
11591                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11592                                       t - s - 1, s + 1);
11593
11594                     /* The #defines are structured so each complement is +1 to
11595                      * the normal one */
11596                     if (complement) {
11597                         namedclass++;
11598                     }
11599                     assert (posixcc[skip] == ':');
11600                     assert (posixcc[skip+1] == ']');
11601                 } else if (!SIZE_ONLY) {
11602                     /* [[=foo=]] and [[.foo.]] are still future. */
11603
11604                     /* adjust RExC_parse so the warning shows after
11605                        the class closes */
11606                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11607                         RExC_parse++;
11608                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11609                 }
11610             } else {
11611                 /* Maternal grandfather:
11612                  * "[:" ending in ":" but not in ":]" */
11613                 if (strict) {
11614                     vFAIL("Unmatched '[' in POSIX class");
11615                 }
11616
11617                 /* Grandfather lone [:, [=, [. */
11618                 RExC_parse = s;
11619             }
11620         }
11621     }
11622
11623     return namedclass;
11624 }
11625
11626 STATIC bool
11627 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11628 {
11629     /* This applies some heuristics at the current parse position (which should
11630      * be at a '[') to see if what follows might be intended to be a [:posix:]
11631      * class.  It returns true if it really is a posix class, of course, but it
11632      * also can return true if it thinks that what was intended was a posix
11633      * class that didn't quite make it.
11634      *
11635      * It will return true for
11636      *      [:alphanumerics:
11637      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11638      *                         ')' indicating the end of the (?[
11639      *      [:any garbage including %^&$ punctuation:]
11640      *
11641      * This is designed to be called only from S_handle_regex_sets; it could be
11642      * easily adapted to be called from the spot at the beginning of regclass()
11643      * that checks to see in a normal bracketed class if the surrounding []
11644      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11645      * change long-standing behavior, so I (khw) didn't do that */
11646     char* p = RExC_parse + 1;
11647     char first_char = *p;
11648
11649     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11650
11651     assert(*(p - 1) == '[');
11652
11653     if (! POSIXCC(first_char)) {
11654         return FALSE;
11655     }
11656
11657     p++;
11658     while (p < RExC_end && isWORDCHAR(*p)) p++;
11659
11660     if (p >= RExC_end) {
11661         return FALSE;
11662     }
11663
11664     if (p - RExC_parse > 2    /* Got at least 1 word character */
11665         && (*p == first_char
11666             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11667     {
11668         return TRUE;
11669     }
11670
11671     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11672
11673     return (p
11674             && p - RExC_parse > 2 /* [:] evaluates to colon;
11675                                       [::] is a bad posix class. */
11676             && first_char == *(p - 1));
11677 }
11678
11679 STATIC regnode *
11680 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11681                    char * const oregcomp_parse)
11682 {
11683     /* Handle the (?[...]) construct to do set operations */
11684
11685     U8 curchar;
11686     UV start, end;      /* End points of code point ranges */
11687     SV* result_string;
11688     char *save_end, *save_parse;
11689     SV* final;
11690     STRLEN len;
11691     regnode* node;
11692     AV* stack;
11693     const bool save_fold = FOLD;
11694
11695     GET_RE_DEBUG_FLAGS_DECL;
11696
11697     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11698
11699     if (LOC) {
11700         vFAIL("(?[...]) not valid in locale");
11701     }
11702     RExC_uni_semantics = 1;
11703
11704     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11705      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11706      * call regclass to handle '[]' so as to not have to reinvent its parsing
11707      * rules here (throwing away the size it computes each time).  And, we exit
11708      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11709      * these things, we need to realize that something preceded by a backslash
11710      * is escaped, so we have to keep track of backslashes */
11711     if (SIZE_ONLY) {
11712         UV depth = 0; /* how many nested (?[...]) constructs */
11713
11714         Perl_ck_warner_d(aTHX_
11715             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11716             "The regex_sets feature is experimental" REPORT_LOCATION,
11717             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11718
11719         while (RExC_parse < RExC_end) {
11720             SV* current = NULL;
11721             RExC_parse = regpatws(pRExC_state, RExC_parse,
11722                                 TRUE); /* means recognize comments */
11723             switch (*RExC_parse) {
11724                 case '?':
11725                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
11726                     /* FALL THROUGH */
11727                 default:
11728                     break;
11729                 case '\\':
11730                     /* Skip the next byte (which could cause us to end up in
11731                      * the middle of a UTF-8 character, but since none of those
11732                      * are confusable with anything we currently handle in this
11733                      * switch (invariants all), it's safe.  We'll just hit the
11734                      * default: case next time and keep on incrementing until
11735                      * we find one of the invariants we do handle. */
11736                     RExC_parse++;
11737                     break;
11738                 case '[':
11739                 {
11740                     /* If this looks like it is a [:posix:] class, leave the
11741                      * parse pointer at the '[' to fool regclass() into
11742                      * thinking it is part of a '[[:posix:]]'.  That function
11743                      * will use strict checking to force a syntax error if it
11744                      * doesn't work out to a legitimate class */
11745                     bool is_posix_class
11746                                     = could_it_be_a_POSIX_class(pRExC_state);
11747                     if (! is_posix_class) {
11748                         RExC_parse++;
11749                     }
11750
11751                     /* regclass() can only return RESTART_UTF8 if multi-char
11752                        folds are allowed.  */
11753                     if (!regclass(pRExC_state, flagp,depth+1,
11754                                   is_posix_class, /* parse the whole char
11755                                                      class only if not a
11756                                                      posix class */
11757                                   FALSE, /* don't allow multi-char folds */
11758                                   TRUE, /* silence non-portable warnings. */
11759                                   &current))
11760                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11761                               (UV) *flagp);
11762
11763                     /* function call leaves parse pointing to the ']', except
11764                      * if we faked it */
11765                     if (is_posix_class) {
11766                         RExC_parse--;
11767                     }
11768
11769                     SvREFCNT_dec(current);   /* In case it returned something */
11770                     break;
11771                 }
11772
11773                 case ']':
11774                     if (depth--) break;
11775                     RExC_parse++;
11776                     if (RExC_parse < RExC_end
11777                         && *RExC_parse == ')')
11778                     {
11779                         node = reganode(pRExC_state, ANYOF, 0);
11780                         RExC_size += ANYOF_SKIP;
11781                         nextchar(pRExC_state);
11782                         Set_Node_Length(node,
11783                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11784                         return node;
11785                     }
11786                     goto no_close;
11787             }
11788             RExC_parse++;
11789         }
11790
11791         no_close:
11792         FAIL("Syntax error in (?[...])");
11793     }
11794
11795     /* Pass 2 only after this.  Everything in this construct is a
11796      * metacharacter.  Operands begin with either a '\' (for an escape
11797      * sequence), or a '[' for a bracketed character class.  Any other
11798      * character should be an operator, or parenthesis for grouping.  Both
11799      * types of operands are handled by calling regclass() to parse them.  It
11800      * is called with a parameter to indicate to return the computed inversion
11801      * list.  The parsing here is implemented via a stack.  Each entry on the
11802      * stack is a single character representing one of the operators, or the
11803      * '('; or else a pointer to an operand inversion list. */
11804
11805 #define IS_OPERAND(a)  (! SvIOK(a))
11806
11807     /* The stack starts empty.  It is a syntax error if the first thing parsed
11808      * is a binary operator; everything else is pushed on the stack.  When an
11809      * operand is parsed, the top of the stack is examined.  If it is a binary
11810      * operator, the item before it should be an operand, and both are replaced
11811      * by the result of doing that operation on the new operand and the one on
11812      * the stack.   Thus a sequence of binary operands is reduced to a single
11813      * one before the next one is parsed.
11814      *
11815      * A unary operator may immediately follow a binary in the input, for
11816      * example
11817      *      [a] + ! [b]
11818      * When an operand is parsed and the top of the stack is a unary operator,
11819      * the operation is performed, and then the stack is rechecked to see if
11820      * this new operand is part of a binary operation; if so, it is handled as
11821      * above.
11822      *
11823      * A '(' is simply pushed on the stack; it is valid only if the stack is
11824      * empty, or the top element of the stack is an operator or another '('
11825      * (for which the parenthesized expression will become an operand).  By the
11826      * time the corresponding ')' is parsed everything in between should have
11827      * been parsed and evaluated to a single operand (or else is a syntax
11828      * error), and is handled as a regular operand */
11829
11830     sv_2mortal((SV *)(stack = newAV()));
11831
11832     while (RExC_parse < RExC_end) {
11833         I32 top_index = av_tindex(stack);
11834         SV** top_ptr;
11835         SV* current = NULL;
11836
11837         /* Skip white space */
11838         RExC_parse = regpatws(pRExC_state, RExC_parse,
11839                                 TRUE); /* means recognize comments */
11840         if (RExC_parse >= RExC_end) {
11841             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11842         }
11843         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11844             break;
11845         }
11846
11847         switch (curchar) {
11848
11849             case '?':
11850                 if (av_tindex(stack) >= 0   /* This makes sure that we can
11851                                                safely subtract 1 from
11852                                                RExC_parse in the next clause.
11853                                                If we have something on the
11854                                                stack, we have parsed something
11855                                              */
11856                     && UCHARAT(RExC_parse - 1) == '('
11857                     && RExC_parse < RExC_end)
11858                 {
11859                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11860                      * This happens when we have some thing like
11861                      *
11862                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11863                      *   ...
11864                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11865                      *
11866                      * Here we would be handling the interpolated
11867                      * '$thai_or_lao'.  We handle this by a recursive call to
11868                      * ourselves which returns the inversion list the
11869                      * interpolated expression evaluates to.  We use the flags
11870                      * from the interpolated pattern. */
11871                     U32 save_flags = RExC_flags;
11872                     const char * const save_parse = ++RExC_parse;
11873
11874                     parse_lparen_question_flags(pRExC_state);
11875
11876                     if (RExC_parse == save_parse  /* Makes sure there was at
11877                                                      least one flag (or this
11878                                                      embedding wasn't compiled)
11879                                                    */
11880                         || RExC_parse >= RExC_end - 4
11881                         || UCHARAT(RExC_parse) != ':'
11882                         || UCHARAT(++RExC_parse) != '('
11883                         || UCHARAT(++RExC_parse) != '?'
11884                         || UCHARAT(++RExC_parse) != '[')
11885                     {
11886
11887                         /* In combination with the above, this moves the
11888                          * pointer to the point just after the first erroneous
11889                          * character (or if there are no flags, to where they
11890                          * should have been) */
11891                         if (RExC_parse >= RExC_end - 4) {
11892                             RExC_parse = RExC_end;
11893                         }
11894                         else if (RExC_parse != save_parse) {
11895                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11896                         }
11897                         vFAIL("Expecting '(?flags:(?[...'");
11898                     }
11899                     RExC_parse++;
11900                     (void) handle_regex_sets(pRExC_state, &current, flagp,
11901                                                     depth+1, oregcomp_parse);
11902
11903                     /* Here, 'current' contains the embedded expression's
11904                      * inversion list, and RExC_parse points to the trailing
11905                      * ']'; the next character should be the ')' which will be
11906                      * paired with the '(' that has been put on the stack, so
11907                      * the whole embedded expression reduces to '(operand)' */
11908                     RExC_parse++;
11909
11910                     RExC_flags = save_flags;
11911                     goto handle_operand;
11912                 }
11913                 /* FALL THROUGH */
11914
11915             default:
11916                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11917                 vFAIL("Unexpected character");
11918
11919             case '\\':
11920                 /* regclass() can only return RESTART_UTF8 if multi-char
11921                    folds are allowed.  */
11922                 if (!regclass(pRExC_state, flagp,depth+1,
11923                               TRUE, /* means parse just the next thing */
11924                               FALSE, /* don't allow multi-char folds */
11925                               FALSE, /* don't silence non-portable warnings.  */
11926                               &current))
11927                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11928                           (UV) *flagp);
11929                 /* regclass() will return with parsing just the \ sequence,
11930                  * leaving the parse pointer at the next thing to parse */
11931                 RExC_parse--;
11932                 goto handle_operand;
11933
11934             case '[':   /* Is a bracketed character class */
11935             {
11936                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11937
11938                 if (! is_posix_class) {
11939                     RExC_parse++;
11940                 }
11941
11942                 /* regclass() can only return RESTART_UTF8 if multi-char
11943                    folds are allowed.  */
11944                 if(!regclass(pRExC_state, flagp,depth+1,
11945                              is_posix_class, /* parse the whole char class
11946                                                 only if not a posix class */
11947                              FALSE, /* don't allow multi-char folds */
11948                              FALSE, /* don't silence non-portable warnings.  */
11949                              &current))
11950                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11951                           (UV) *flagp);
11952                 /* function call leaves parse pointing to the ']', except if we
11953                  * faked it */
11954                 if (is_posix_class) {
11955                     RExC_parse--;
11956                 }
11957
11958                 goto handle_operand;
11959             }
11960
11961             case '&':
11962             case '|':
11963             case '+':
11964             case '-':
11965             case '^':
11966                 if (top_index < 0
11967                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11968                     || ! IS_OPERAND(*top_ptr))
11969                 {
11970                     RExC_parse++;
11971                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11972                 }
11973                 av_push(stack, newSVuv(curchar));
11974                 break;
11975
11976             case '!':
11977                 av_push(stack, newSVuv(curchar));
11978                 break;
11979
11980             case '(':
11981                 if (top_index >= 0) {
11982                     top_ptr = av_fetch(stack, top_index, FALSE);
11983                     assert(top_ptr);
11984                     if (IS_OPERAND(*top_ptr)) {
11985                         RExC_parse++;
11986                         vFAIL("Unexpected '(' with no preceding operator");
11987                     }
11988                 }
11989                 av_push(stack, newSVuv(curchar));
11990                 break;
11991
11992             case ')':
11993             {
11994                 SV* lparen;
11995                 if (top_index < 1
11996                     || ! (current = av_pop(stack))
11997                     || ! IS_OPERAND(current)
11998                     || ! (lparen = av_pop(stack))
11999                     || IS_OPERAND(lparen)
12000                     || SvUV(lparen) != '(')
12001                 {
12002                     SvREFCNT_dec(current);
12003                     RExC_parse++;
12004                     vFAIL("Unexpected ')'");
12005                 }
12006                 top_index -= 2;
12007                 SvREFCNT_dec_NN(lparen);
12008
12009                 /* FALL THROUGH */
12010             }
12011
12012               handle_operand:
12013
12014                 /* Here, we have an operand to process, in 'current' */
12015
12016                 if (top_index < 0) {    /* Just push if stack is empty */
12017                     av_push(stack, current);
12018                 }
12019                 else {
12020                     SV* top = av_pop(stack);
12021                     SV *prev = NULL;
12022                     char current_operator;
12023
12024                     if (IS_OPERAND(top)) {
12025                         SvREFCNT_dec_NN(top);
12026                         SvREFCNT_dec_NN(current);
12027                         vFAIL("Operand with no preceding operator");
12028                     }
12029                     current_operator = (char) SvUV(top);
12030                     switch (current_operator) {
12031                         case '(':   /* Push the '(' back on followed by the new
12032                                        operand */
12033                             av_push(stack, top);
12034                             av_push(stack, current);
12035                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12036                                                    just after the 'break', so
12037                                                    it doesn't get wrongly freed
12038                                                  */
12039                             break;
12040
12041                         case '!':
12042                             _invlist_invert(current);
12043
12044                             /* Unlike binary operators, the top of the stack,
12045                              * now that this unary one has been popped off, may
12046                              * legally be an operator, and we now have operand
12047                              * for it. */
12048                             top_index--;
12049                             SvREFCNT_dec_NN(top);
12050                             goto handle_operand;
12051
12052                         case '&':
12053                             prev = av_pop(stack);
12054                             _invlist_intersection(prev,
12055                                                    current,
12056                                                    &current);
12057                             av_push(stack, current);
12058                             break;
12059
12060                         case '|':
12061                         case '+':
12062                             prev = av_pop(stack);
12063                             _invlist_union(prev, current, &current);
12064                             av_push(stack, current);
12065                             break;
12066
12067                         case '-':
12068                             prev = av_pop(stack);;
12069                             _invlist_subtract(prev, current, &current);
12070                             av_push(stack, current);
12071                             break;
12072
12073                         case '^':   /* The union minus the intersection */
12074                         {
12075                             SV* i = NULL;
12076                             SV* u = NULL;
12077                             SV* element;
12078
12079                             prev = av_pop(stack);
12080                             _invlist_union(prev, current, &u);
12081                             _invlist_intersection(prev, current, &i);
12082                             /* _invlist_subtract will overwrite current
12083                                 without freeing what it already contains */
12084                             element = current;
12085                             _invlist_subtract(u, i, &current);
12086                             av_push(stack, current);
12087                             SvREFCNT_dec_NN(i);
12088                             SvREFCNT_dec_NN(u);
12089                             SvREFCNT_dec_NN(element);
12090                             break;
12091                         }
12092
12093                         default:
12094                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12095                 }
12096                 SvREFCNT_dec_NN(top);
12097                 SvREFCNT_dec(prev);
12098             }
12099         }
12100
12101         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12102     }
12103
12104     if (av_tindex(stack) < 0   /* Was empty */
12105         || ((final = av_pop(stack)) == NULL)
12106         || ! IS_OPERAND(final)
12107         || av_tindex(stack) >= 0)  /* More left on stack */
12108     {
12109         vFAIL("Incomplete expression within '(?[ ])'");
12110     }
12111
12112     /* Here, 'final' is the resultant inversion list from evaluating the
12113      * expression.  Return it if so requested */
12114     if (return_invlist) {
12115         *return_invlist = final;
12116         return END;
12117     }
12118
12119     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12120      * expecting a string of ranges and individual code points */
12121     invlist_iterinit(final);
12122     result_string = newSVpvs("");
12123     while (invlist_iternext(final, &start, &end)) {
12124         if (start == end) {
12125             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12126         }
12127         else {
12128             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12129                                                      start,          end);
12130         }
12131     }
12132
12133     save_parse = RExC_parse;
12134     RExC_parse = SvPV(result_string, len);
12135     save_end = RExC_end;
12136     RExC_end = RExC_parse + len;
12137
12138     /* We turn off folding around the call, as the class we have constructed
12139      * already has all folding taken into consideration, and we don't want
12140      * regclass() to add to that */
12141     RExC_flags &= ~RXf_PMf_FOLD;
12142     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12143      */
12144     node = regclass(pRExC_state, flagp,depth+1,
12145                     FALSE, /* means parse the whole char class */
12146                     FALSE, /* don't allow multi-char folds */
12147                     TRUE, /* silence non-portable warnings.  The above may very
12148                              well have generated non-portable code points, but
12149                              they're valid on this machine */
12150                     NULL);
12151     if (!node)
12152         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12153                     PTR2UV(flagp));
12154     if (save_fold) {
12155         RExC_flags |= RXf_PMf_FOLD;
12156     }
12157     RExC_parse = save_parse + 1;
12158     RExC_end = save_end;
12159     SvREFCNT_dec_NN(final);
12160     SvREFCNT_dec_NN(result_string);
12161
12162     nextchar(pRExC_state);
12163     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12164     return node;
12165 }
12166 #undef IS_OPERAND
12167
12168 /* The names of properties whose definitions are not known at compile time are
12169  * stored in this SV, after a constant heading.  So if the length has been
12170  * changed since initialization, then there is a run-time definition. */
12171 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12172
12173 STATIC regnode *
12174 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12175                  const bool stop_at_1,  /* Just parse the next thing, don't
12176                                            look for a full character class */
12177                  bool allow_multi_folds,
12178                  const bool silence_non_portable,   /* Don't output warnings
12179                                                        about too large
12180                                                        characters */
12181                  SV** ret_invlist)  /* Return an inversion list, not a node */
12182 {
12183     /* parse a bracketed class specification.  Most of these will produce an
12184      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12185      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12186      * under /i with multi-character folds: it will be rewritten following the
12187      * paradigm of this example, where the <multi-fold>s are characters which
12188      * fold to multiple character sequences:
12189      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12190      * gets effectively rewritten as:
12191      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12192      * reg() gets called (recursively) on the rewritten version, and this
12193      * function will return what it constructs.  (Actually the <multi-fold>s
12194      * aren't physically removed from the [abcdefghi], it's just that they are
12195      * ignored in the recursion by means of a flag:
12196      * <RExC_in_multi_char_class>.)
12197      *
12198      * ANYOF nodes contain a bit map for the first 256 characters, with the
12199      * corresponding bit set if that character is in the list.  For characters
12200      * above 255, a range list or swash is used.  There are extra bits for \w,
12201      * etc. in locale ANYOFs, as what these match is not determinable at
12202      * compile time
12203      *
12204      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12205      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12206      */
12207
12208     dVAR;
12209     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12210     IV range = 0;
12211     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12212     regnode *ret;
12213     STRLEN numlen;
12214     IV namedclass = OOB_NAMEDCLASS;
12215     char *rangebegin = NULL;
12216     bool need_class = 0;
12217     SV *listsv = NULL;
12218     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12219                                       than just initialized.  */
12220     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12221     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12222                                extended beyond the Latin1 range */
12223     UV element_count = 0;   /* Number of distinct elements in the class.
12224                                Optimizations may be possible if this is tiny */
12225     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12226                                        character; used under /i */
12227     UV n;
12228     char * stop_ptr = RExC_end;    /* where to stop parsing */
12229     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12230                                                    space? */
12231     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12232
12233     /* Unicode properties are stored in a swash; this holds the current one
12234      * being parsed.  If this swash is the only above-latin1 component of the
12235      * character class, an optimization is to pass it directly on to the
12236      * execution engine.  Otherwise, it is set to NULL to indicate that there
12237      * are other things in the class that have to be dealt with at execution
12238      * time */
12239     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12240
12241     /* Set if a component of this character class is user-defined; just passed
12242      * on to the engine */
12243     bool has_user_defined_property = FALSE;
12244
12245     /* inversion list of code points this node matches only when the target
12246      * string is in UTF-8.  (Because is under /d) */
12247     SV* depends_list = NULL;
12248
12249     /* inversion list of code points this node matches.  For much of the
12250      * function, it includes only those that match regardless of the utf8ness
12251      * of the target string */
12252     SV* cp_list = NULL;
12253
12254 #ifdef EBCDIC
12255     /* In a range, counts how many 0-2 of the ends of it came from literals,
12256      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12257     UV literal_endpoint = 0;
12258 #endif
12259     bool invert = FALSE;    /* Is this class to be complemented */
12260
12261     /* Is there any thing like \W or [:^digit:] that matches above the legal
12262      * Unicode range? */
12263     bool runtime_posix_matches_above_Unicode = FALSE;
12264
12265     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12266         case we need to change the emitted regop to an EXACT. */
12267     const char * orig_parse = RExC_parse;
12268     const I32 orig_size = RExC_size;
12269     GET_RE_DEBUG_FLAGS_DECL;
12270
12271     PERL_ARGS_ASSERT_REGCLASS;
12272 #ifndef DEBUGGING
12273     PERL_UNUSED_ARG(depth);
12274 #endif
12275
12276     DEBUG_PARSE("clas");
12277
12278     /* Assume we are going to generate an ANYOF node. */
12279     ret = reganode(pRExC_state, ANYOF, 0);
12280
12281     if (SIZE_ONLY) {
12282         RExC_size += ANYOF_SKIP;
12283         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12284     }
12285     else {
12286         ANYOF_FLAGS(ret) = 0;
12287
12288         RExC_emit += ANYOF_SKIP;
12289         if (LOC) {
12290             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12291         }
12292         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12293         initial_listsv_len = SvCUR(listsv);
12294         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12295     }
12296
12297     if (skip_white) {
12298         RExC_parse = regpatws(pRExC_state, RExC_parse,
12299                               FALSE /* means don't recognize comments */);
12300     }
12301
12302     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12303         RExC_parse++;
12304         invert = TRUE;
12305         allow_multi_folds = FALSE;
12306         RExC_naughty++;
12307         if (skip_white) {
12308             RExC_parse = regpatws(pRExC_state, RExC_parse,
12309                                   FALSE /* means don't recognize comments */);
12310         }
12311     }
12312
12313     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12314     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12315         const char *s = RExC_parse;
12316         const char  c = *s++;
12317
12318         while (isWORDCHAR(*s))
12319             s++;
12320         if (*s && c == *s && s[1] == ']') {
12321             SAVEFREESV(RExC_rx_sv);
12322             ckWARN3reg(s+2,
12323                        "POSIX syntax [%c %c] belongs inside character classes",
12324                        c, c);
12325             (void)ReREFCNT_inc(RExC_rx_sv);
12326         }
12327     }
12328
12329     /* If the caller wants us to just parse a single element, accomplish this
12330      * by faking the loop ending condition */
12331     if (stop_at_1 && RExC_end > RExC_parse) {
12332         stop_ptr = RExC_parse + 1;
12333     }
12334
12335     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12336     if (UCHARAT(RExC_parse) == ']')
12337         goto charclassloop;
12338
12339 parseit:
12340     while (1) {
12341         if  (RExC_parse >= stop_ptr) {
12342             break;
12343         }
12344
12345         if (skip_white) {
12346             RExC_parse = regpatws(pRExC_state, RExC_parse,
12347                                   FALSE /* means don't recognize comments */);
12348         }
12349
12350         if  (UCHARAT(RExC_parse) == ']') {
12351             break;
12352         }
12353
12354     charclassloop:
12355
12356         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12357         save_value = value;
12358         save_prevvalue = prevvalue;
12359
12360         if (!range) {
12361             rangebegin = RExC_parse;
12362             element_count++;
12363         }
12364         if (UTF) {
12365             value = utf8n_to_uvchr((U8*)RExC_parse,
12366                                    RExC_end - RExC_parse,
12367                                    &numlen, UTF8_ALLOW_DEFAULT);
12368             RExC_parse += numlen;
12369         }
12370         else
12371             value = UCHARAT(RExC_parse++);
12372
12373         if (value == '['
12374             && RExC_parse < RExC_end
12375             && POSIXCC(UCHARAT(RExC_parse)))
12376         {
12377             namedclass = regpposixcc(pRExC_state, value, strict);
12378         }
12379         else if (value == '\\') {
12380             if (UTF) {
12381                 value = utf8n_to_uvchr((U8*)RExC_parse,
12382                                    RExC_end - RExC_parse,
12383                                    &numlen, UTF8_ALLOW_DEFAULT);
12384                 RExC_parse += numlen;
12385             }
12386             else
12387                 value = UCHARAT(RExC_parse++);
12388
12389             /* Some compilers cannot handle switching on 64-bit integer
12390              * values, therefore value cannot be an UV.  Yes, this will
12391              * be a problem later if we want switch on Unicode.
12392              * A similar issue a little bit later when switching on
12393              * namedclass. --jhi */
12394
12395             /* If the \ is escaping white space when white space is being
12396              * skipped, it means that that white space is wanted literally, and
12397              * is already in 'value'.  Otherwise, need to translate the escape
12398              * into what it signifies. */
12399             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12400
12401             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12402             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12403             case 's':   namedclass = ANYOF_SPACE;       break;
12404             case 'S':   namedclass = ANYOF_NSPACE;      break;
12405             case 'd':   namedclass = ANYOF_DIGIT;       break;
12406             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12407             case 'v':   namedclass = ANYOF_VERTWS;      break;
12408             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12409             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12410             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12411             case 'N':  /* Handle \N{NAME} in class */
12412                 {
12413                     /* We only pay attention to the first char of
12414                     multichar strings being returned. I kinda wonder
12415                     if this makes sense as it does change the behaviour
12416                     from earlier versions, OTOH that behaviour was broken
12417                     as well. */
12418                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12419                                       TRUE, /* => charclass */
12420                                       strict))
12421                     {
12422                         if (*flagp & RESTART_UTF8)
12423                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
12424                         goto parseit;
12425                     }
12426                 }
12427                 break;
12428             case 'p':
12429             case 'P':
12430                 {
12431                 char *e;
12432
12433                 /* We will handle any undefined properties ourselves */
12434                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12435
12436                 if (RExC_parse >= RExC_end)
12437                     vFAIL2("Empty \\%c{}", (U8)value);
12438                 if (*RExC_parse == '{') {
12439                     const U8 c = (U8)value;
12440                     e = strchr(RExC_parse++, '}');
12441                     if (!e)
12442                         vFAIL2("Missing right brace on \\%c{}", c);
12443                     while (isSPACE(UCHARAT(RExC_parse)))
12444                         RExC_parse++;
12445                     if (e == RExC_parse)
12446                         vFAIL2("Empty \\%c{}", c);
12447                     n = e - RExC_parse;
12448                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12449                         n--;
12450                 }
12451                 else {
12452                     e = RExC_parse;
12453                     n = 1;
12454                 }
12455                 if (!SIZE_ONLY) {
12456                     SV* invlist;
12457                     char* name;
12458
12459                     if (UCHARAT(RExC_parse) == '^') {
12460                          RExC_parse++;
12461                          n--;
12462                          /* toggle.  (The rhs xor gets the single bit that
12463                           * differs between P and p; the other xor inverts just
12464                           * that bit) */
12465                          value ^= 'P' ^ 'p';
12466
12467                          while (isSPACE(UCHARAT(RExC_parse))) {
12468                               RExC_parse++;
12469                               n--;
12470                          }
12471                     }
12472                     /* Try to get the definition of the property into
12473                      * <invlist>.  If /i is in effect, the effective property
12474                      * will have its name be <__NAME_i>.  The design is
12475                      * discussed in commit
12476                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12477                     Newx(name, n + sizeof("_i__\n"), char);
12478
12479                     sprintf(name, "%s%.*s%s\n",
12480                                     (FOLD) ? "__" : "",
12481                                     (int)n,
12482                                     RExC_parse,
12483                                     (FOLD) ? "_i" : ""
12484                     );
12485
12486                     /* Look up the property name, and get its swash and
12487                      * inversion list, if the property is found  */
12488                     if (swash) {
12489                         SvREFCNT_dec_NN(swash);
12490                     }
12491                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12492                                              1, /* binary */
12493                                              0, /* not tr/// */
12494                                              NULL, /* No inversion list */
12495                                              &swash_init_flags
12496                                             );
12497                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12498                         if (swash) {
12499                             SvREFCNT_dec_NN(swash);
12500                             swash = NULL;
12501                         }
12502
12503                         /* Here didn't find it.  It could be a user-defined
12504                          * property that will be available at run-time.  If we
12505                          * accept only compile-time properties, is an error;
12506                          * otherwise add it to the list for run-time look up */
12507                         if (ret_invlist) {
12508                             RExC_parse = e + 1;
12509                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12510                         }
12511                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12512                                         (value == 'p' ? '+' : '!'),
12513                                         name);
12514                         has_user_defined_property = TRUE;
12515
12516                         /* We don't know yet, so have to assume that the
12517                          * property could match something in the Latin1 range,
12518                          * hence something that isn't utf8.  Note that this
12519                          * would cause things in <depends_list> to match
12520                          * inappropriately, except that any \p{}, including
12521                          * this one forces Unicode semantics, which means there
12522                          * is <no depends_list> */
12523                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12524                     }
12525                     else {
12526
12527                         /* Here, did get the swash and its inversion list.  If
12528                          * the swash is from a user-defined property, then this
12529                          * whole character class should be regarded as such */
12530                         has_user_defined_property =
12531                                     (swash_init_flags
12532                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12533
12534                         /* Invert if asking for the complement */
12535                         if (value == 'P') {
12536                             _invlist_union_complement_2nd(properties,
12537                                                           invlist,
12538                                                           &properties);
12539
12540                             /* The swash can't be used as-is, because we've
12541                              * inverted things; delay removing it to here after
12542                              * have copied its invlist above */
12543                             SvREFCNT_dec_NN(swash);
12544                             swash = NULL;
12545                         }
12546                         else {
12547                             _invlist_union(properties, invlist, &properties);
12548                         }
12549                     }
12550                     Safefree(name);
12551                 }
12552                 RExC_parse = e + 1;
12553                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12554                                                 named */
12555
12556                 /* \p means they want Unicode semantics */
12557                 RExC_uni_semantics = 1;
12558                 }
12559                 break;
12560             case 'n':   value = '\n';                   break;
12561             case 'r':   value = '\r';                   break;
12562             case 't':   value = '\t';                   break;
12563             case 'f':   value = '\f';                   break;
12564             case 'b':   value = '\b';                   break;
12565             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12566             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12567             case 'o':
12568                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12569                 {
12570                     const char* error_msg;
12571                     bool valid = grok_bslash_o(&RExC_parse,
12572                                                &value,
12573                                                &error_msg,
12574                                                SIZE_ONLY,   /* warnings in pass
12575                                                                1 only */
12576                                                strict,
12577                                                silence_non_portable,
12578                                                UTF);
12579                     if (! valid) {
12580                         vFAIL(error_msg);
12581                     }
12582                 }
12583                 if (PL_encoding && value < 0x100) {
12584                     goto recode_encoding;
12585                 }
12586                 break;
12587             case 'x':
12588                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12589                 {
12590                     const char* error_msg;
12591                     bool valid = grok_bslash_x(&RExC_parse,
12592                                                &value,
12593                                                &error_msg,
12594                                                TRUE, /* Output warnings */
12595                                                strict,
12596                                                silence_non_portable,
12597                                                UTF);
12598                     if (! valid) {
12599                         vFAIL(error_msg);
12600                     }
12601                 }
12602                 if (PL_encoding && value < 0x100)
12603                     goto recode_encoding;
12604                 break;
12605             case 'c':
12606                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12607                 break;
12608             case '0': case '1': case '2': case '3': case '4':
12609             case '5': case '6': case '7':
12610                 {
12611                     /* Take 1-3 octal digits */
12612                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12613                     numlen = (strict) ? 4 : 3;
12614                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12615                     RExC_parse += numlen;
12616                     if (numlen != 3) {
12617                         if (strict) {
12618                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12619                             vFAIL("Need exactly 3 octal digits");
12620                         }
12621                         else if (! SIZE_ONLY /* like \08, \178 */
12622                                  && numlen < 3
12623                                  && RExC_parse < RExC_end
12624                                  && isDIGIT(*RExC_parse)
12625                                  && ckWARN(WARN_REGEXP))
12626                         {
12627                             SAVEFREESV(RExC_rx_sv);
12628                             reg_warn_non_literal_string(
12629                                  RExC_parse + 1,
12630                                  form_short_octal_warning(RExC_parse, numlen));
12631                             (void)ReREFCNT_inc(RExC_rx_sv);
12632                         }
12633                     }
12634                     if (PL_encoding && value < 0x100)
12635                         goto recode_encoding;
12636                     break;
12637                 }
12638             recode_encoding:
12639                 if (! RExC_override_recoding) {
12640                     SV* enc = PL_encoding;
12641                     value = reg_recode((const char)(U8)value, &enc);
12642                     if (!enc) {
12643                         if (strict) {
12644                             vFAIL("Invalid escape in the specified encoding");
12645                         }
12646                         else if (SIZE_ONLY) {
12647                             ckWARNreg(RExC_parse,
12648                                   "Invalid escape in the specified encoding");
12649                         }
12650                     }
12651                     break;
12652                 }
12653             default:
12654                 /* Allow \_ to not give an error */
12655                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12656                     if (strict) {
12657                         vFAIL2("Unrecognized escape \\%c in character class",
12658                                (int)value);
12659                     }
12660                     else {
12661                         SAVEFREESV(RExC_rx_sv);
12662                         ckWARN2reg(RExC_parse,
12663                             "Unrecognized escape \\%c in character class passed through",
12664                             (int)value);
12665                         (void)ReREFCNT_inc(RExC_rx_sv);
12666                     }
12667                 }
12668                 break;
12669             }   /* End of switch on char following backslash */
12670         } /* end of handling backslash escape sequences */
12671 #ifdef EBCDIC
12672         else
12673             literal_endpoint++;
12674 #endif
12675
12676         /* Here, we have the current token in 'value' */
12677
12678         /* What matches in a locale is not known until runtime.  This includes
12679          * what the Posix classes (like \w, [:space:]) match.  Room must be
12680          * reserved (one time per class) to store such classes, either if Perl
12681          * is compiled so that locale nodes always should have this space, or
12682          * if there is such class info to be stored.  The space will contain a
12683          * bit for each named class that is to be matched against.  This isn't
12684          * needed for \p{} and pseudo-classes, as they are not affected by
12685          * locale, and hence are dealt with separately */
12686         if (LOC
12687             && ! need_class
12688             && (ANYOF_LOCALE == ANYOF_CLASS
12689                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12690         {
12691             need_class = 1;
12692             if (SIZE_ONLY) {
12693                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12694             }
12695             else {
12696                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12697                 ANYOF_CLASS_ZERO(ret);
12698             }
12699             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12700         }
12701
12702         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12703
12704             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12705              * literal, as is the character that began the false range, i.e.
12706              * the 'a' in the examples */
12707             if (range) {
12708                 if (!SIZE_ONLY) {
12709                     const int w = (RExC_parse >= rangebegin)
12710                                   ? RExC_parse - rangebegin
12711                                   : 0;
12712                     if (strict) {
12713                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12714                     }
12715                     else {
12716                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12717                         ckWARN4reg(RExC_parse,
12718                                 "False [] range \"%*.*s\"",
12719                                 w, w, rangebegin);
12720                         (void)ReREFCNT_inc(RExC_rx_sv);
12721                         cp_list = add_cp_to_invlist(cp_list, '-');
12722                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12723                     }
12724                 }
12725
12726                 range = 0; /* this was not a true range */
12727                 element_count += 2; /* So counts for three values */
12728             }
12729
12730             if (! SIZE_ONLY) {
12731                 U8 classnum = namedclass_to_classnum(namedclass);
12732                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12733                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12734
12735                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12736                          * /l make a difference in what these match.  There
12737                          * would be problems if these characters had folds
12738                          * other than themselves, as cp_list is subject to
12739                          * folding. */
12740                         if (classnum != _CC_VERTSPACE) {
12741                             assert(   namedclass == ANYOF_HORIZWS
12742                                    || namedclass == ANYOF_NHORIZWS);
12743
12744                             /* It turns out that \h is just a synonym for
12745                              * XPosixBlank */
12746                             classnum = _CC_BLANK;
12747                         }
12748
12749                         _invlist_union_maybe_complement_2nd(
12750                                 cp_list,
12751                                 PL_XPosix_ptrs[classnum],
12752                                 cBOOL(namedclass % 2), /* Complement if odd
12753                                                           (NHORIZWS, NVERTWS)
12754                                                         */
12755                                 &cp_list);
12756                     }
12757                 }
12758                 else if (classnum == _CC_ASCII) {
12759 #ifdef HAS_ISASCII
12760                     if (LOC) {
12761                         ANYOF_CLASS_SET(ret, namedclass);
12762                     }
12763                     else
12764 #endif  /* Not isascii(); just use the hard-coded definition for it */
12765                         _invlist_union_maybe_complement_2nd(
12766                                 posixes,
12767                                 PL_ASCII,
12768                                 cBOOL(namedclass % 2), /* Complement if odd
12769                                                           (NASCII) */
12770                                 &posixes);
12771                 }
12772                 else {  /* Garden variety class */
12773
12774                     /* The ascii range inversion list */
12775                     SV* ascii_source = PL_Posix_ptrs[classnum];
12776
12777                     /* The full Latin1 range inversion list */
12778                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12779
12780                     /* This code is structured into two major clauses.  The
12781                      * first is for classes whose complete definitions may not
12782                      * already be known.  It not, the Latin1 definition
12783                      * (guaranteed to already known) is used plus code is
12784                      * generated to load the rest at run-time (only if needed).
12785                      * If the complete definition is known, it drops down to
12786                      * the second clause, where the complete definition is
12787                      * known */
12788
12789                     if (classnum < _FIRST_NON_SWASH_CC) {
12790
12791                         /* Here, the class has a swash, which may or not
12792                          * already be loaded */
12793
12794                         /* The name of the property to use to match the full
12795                          * eXtended Unicode range swash for this character
12796                          * class */
12797                         const char *Xname = swash_property_names[classnum];
12798
12799                         /* If returning the inversion list, we can't defer
12800                          * getting this until runtime */
12801                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12802                             PL_utf8_swash_ptrs[classnum] =
12803                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12804                                              1, /* binary */
12805                                              0, /* not tr/// */
12806                                              NULL, /* No inversion list */
12807                                              NULL  /* No flags */
12808                                             );
12809                             assert(PL_utf8_swash_ptrs[classnum]);
12810                         }
12811                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12812                             if (namedclass % 2 == 0) { /* A non-complemented
12813                                                           class */
12814                                 /* If not /a matching, there are code points we
12815                                  * don't know at compile time.  Arrange for the
12816                                  * unknown matches to be loaded at run-time, if
12817                                  * needed */
12818                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12819                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12820                                                                  Xname);
12821                                 }
12822                                 if (LOC) {  /* Under locale, set run-time
12823                                                lookup */
12824                                     ANYOF_CLASS_SET(ret, namedclass);
12825                                 }
12826                                 else {
12827                                     /* Add the current class's code points to
12828                                      * the running total */
12829                                     _invlist_union(posixes,
12830                                                    (AT_LEAST_ASCII_RESTRICTED)
12831                                                         ? ascii_source
12832                                                         : l1_source,
12833                                                    &posixes);
12834                                 }
12835                             }
12836                             else {  /* A complemented class */
12837                                 if (AT_LEAST_ASCII_RESTRICTED) {
12838                                     /* Under /a should match everything above
12839                                      * ASCII, plus the complement of the set's
12840                                      * ASCII matches */
12841                                     _invlist_union_complement_2nd(posixes,
12842                                                                   ascii_source,
12843                                                                   &posixes);
12844                                 }
12845                                 else {
12846                                     /* Arrange for the unknown matches to be
12847                                      * loaded at run-time, if needed */
12848                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12849                                                                  Xname);
12850                                     runtime_posix_matches_above_Unicode = TRUE;
12851                                     if (LOC) {
12852                                         ANYOF_CLASS_SET(ret, namedclass);
12853                                     }
12854                                     else {
12855
12856                                         /* We want to match everything in
12857                                          * Latin1, except those things that
12858                                          * l1_source matches */
12859                                         SV* scratch_list = NULL;
12860                                         _invlist_subtract(PL_Latin1, l1_source,
12861                                                           &scratch_list);
12862
12863                                         /* Add the list from this class to the
12864                                          * running total */
12865                                         if (! posixes) {
12866                                             posixes = scratch_list;
12867                                         }
12868                                         else {
12869                                             _invlist_union(posixes,
12870                                                            scratch_list,
12871                                                            &posixes);
12872                                             SvREFCNT_dec_NN(scratch_list);
12873                                         }
12874                                         if (DEPENDS_SEMANTICS) {
12875                                             ANYOF_FLAGS(ret)
12876                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12877                                         }
12878                                     }
12879                                 }
12880                             }
12881                             goto namedclass_done;
12882                         }
12883
12884                         /* Here, there is a swash loaded for the class.  If no
12885                          * inversion list for it yet, get it */
12886                         if (! PL_XPosix_ptrs[classnum]) {
12887                             PL_XPosix_ptrs[classnum]
12888                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12889                         }
12890                     }
12891
12892                     /* Here there is an inversion list already loaded for the
12893                      * entire class */
12894
12895                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12896                                                    like ANYOF_PUNCT */
12897                         if (! LOC) {
12898                             /* For non-locale, just add it to any existing list
12899                              * */
12900                             _invlist_union(posixes,
12901                                            (AT_LEAST_ASCII_RESTRICTED)
12902                                                ? ascii_source
12903                                                : PL_XPosix_ptrs[classnum],
12904                                            &posixes);
12905                         }
12906                         else {  /* Locale */
12907                             SV* scratch_list = NULL;
12908
12909                             /* For above Latin1 code points, we use the full
12910                              * Unicode range */
12911                             _invlist_intersection(PL_AboveLatin1,
12912                                                   PL_XPosix_ptrs[classnum],
12913                                                   &scratch_list);
12914                             /* And set the output to it, adding instead if
12915                              * there already is an output.  Checking if
12916                              * 'posixes' is NULL first saves an extra clone.
12917                              * Its reference count will be decremented at the
12918                              * next union, etc, or if this is the only
12919                              * instance, at the end of the routine */
12920                             if (! posixes) {
12921                                 posixes = scratch_list;
12922                             }
12923                             else {
12924                                 _invlist_union(posixes, scratch_list, &posixes);
12925                                 SvREFCNT_dec_NN(scratch_list);
12926                             }
12927
12928 #ifndef HAS_ISBLANK
12929                             if (namedclass != ANYOF_BLANK) {
12930 #endif
12931                                 /* Set this class in the node for runtime
12932                                  * matching */
12933                                 ANYOF_CLASS_SET(ret, namedclass);
12934 #ifndef HAS_ISBLANK
12935                             }
12936                             else {
12937                                 /* No isblank(), use the hard-coded ASCII-range
12938                                  * blanks, adding them to the running total. */
12939
12940                                 _invlist_union(posixes, ascii_source, &posixes);
12941                             }
12942 #endif
12943                         }
12944                     }
12945                     else {  /* A complemented class, like ANYOF_NPUNCT */
12946                         if (! LOC) {
12947                             _invlist_union_complement_2nd(
12948                                                 posixes,
12949                                                 (AT_LEAST_ASCII_RESTRICTED)
12950                                                     ? ascii_source
12951                                                     : PL_XPosix_ptrs[classnum],
12952                                                 &posixes);
12953                             /* Under /d, everything in the upper half of the
12954                              * Latin1 range matches this complement */
12955                             if (DEPENDS_SEMANTICS) {
12956                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12957                             }
12958                         }
12959                         else {  /* Locale */
12960                             SV* scratch_list = NULL;
12961                             _invlist_subtract(PL_AboveLatin1,
12962                                               PL_XPosix_ptrs[classnum],
12963                                               &scratch_list);
12964                             if (! posixes) {
12965                                 posixes = scratch_list;
12966                             }
12967                             else {
12968                                 _invlist_union(posixes, scratch_list, &posixes);
12969                                 SvREFCNT_dec_NN(scratch_list);
12970                             }
12971 #ifndef HAS_ISBLANK
12972                             if (namedclass != ANYOF_NBLANK) {
12973 #endif
12974                                 ANYOF_CLASS_SET(ret, namedclass);
12975 #ifndef HAS_ISBLANK
12976                             }
12977                             else {
12978                                 /* Get the list of all code points in Latin1
12979                                  * that are not ASCII blanks, and add them to
12980                                  * the running total */
12981                                 _invlist_subtract(PL_Latin1, ascii_source,
12982                                                   &scratch_list);
12983                                 _invlist_union(posixes, scratch_list, &posixes);
12984                                 SvREFCNT_dec_NN(scratch_list);
12985                             }
12986 #endif
12987                         }
12988                     }
12989                 }
12990               namedclass_done:
12991                 continue;   /* Go get next character */
12992             }
12993         } /* end of namedclass \blah */
12994
12995         /* Here, we have a single value.  If 'range' is set, it is the ending
12996          * of a range--check its validity.  Later, we will handle each
12997          * individual code point in the range.  If 'range' isn't set, this
12998          * could be the beginning of a range, so check for that by looking
12999          * ahead to see if the next real character to be processed is the range
13000          * indicator--the minus sign */
13001
13002         if (skip_white) {
13003             RExC_parse = regpatws(pRExC_state, RExC_parse,
13004                                 FALSE /* means don't recognize comments */);
13005         }
13006
13007         if (range) {
13008             if (prevvalue > value) /* b-a */ {
13009                 const int w = RExC_parse - rangebegin;
13010                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13011                 range = 0; /* not a valid range */
13012             }
13013         }
13014         else {
13015             prevvalue = value; /* save the beginning of the potential range */
13016             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13017                 && *RExC_parse == '-')
13018             {
13019                 char* next_char_ptr = RExC_parse + 1;
13020                 if (skip_white) {   /* Get the next real char after the '-' */
13021                     next_char_ptr = regpatws(pRExC_state,
13022                                              RExC_parse + 1,
13023                                              FALSE); /* means don't recognize
13024                                                         comments */
13025                 }
13026
13027                 /* If the '-' is at the end of the class (just before the ']',
13028                  * it is a literal minus; otherwise it is a range */
13029                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13030                     RExC_parse = next_char_ptr;
13031
13032                     /* a bad range like \w-, [:word:]- ? */
13033                     if (namedclass > OOB_NAMEDCLASS) {
13034                         if (strict || ckWARN(WARN_REGEXP)) {
13035                             const int w =
13036                                 RExC_parse >= rangebegin ?
13037                                 RExC_parse - rangebegin : 0;
13038                             if (strict) {
13039                                 vFAIL4("False [] range \"%*.*s\"",
13040                                     w, w, rangebegin);
13041                             }
13042                             else {
13043                                 vWARN4(RExC_parse,
13044                                     "False [] range \"%*.*s\"",
13045                                     w, w, rangebegin);
13046                             }
13047                         }
13048                         if (!SIZE_ONLY) {
13049                             cp_list = add_cp_to_invlist(cp_list, '-');
13050                         }
13051                         element_count++;
13052                     } else
13053                         range = 1;      /* yeah, it's a range! */
13054                     continue;   /* but do it the next time */
13055                 }
13056             }
13057         }
13058
13059         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13060          * if not */
13061
13062         /* non-Latin1 code point implies unicode semantics.  Must be set in
13063          * pass1 so is there for the whole of pass 2 */
13064         if (value > 255) {
13065             RExC_uni_semantics = 1;
13066         }
13067
13068         /* Ready to process either the single value, or the completed range.
13069          * For single-valued non-inverted ranges, we consider the possibility
13070          * of multi-char folds.  (We made a conscious decision to not do this
13071          * for the other cases because it can often lead to non-intuitive
13072          * results.  For example, you have the peculiar case that:
13073          *  "s s" =~ /^[^\xDF]+$/i => Y
13074          *  "ss"  =~ /^[^\xDF]+$/i => N
13075          *
13076          * See [perl #89750] */
13077         if (FOLD && allow_multi_folds && value == prevvalue) {
13078             if (value == LATIN_SMALL_LETTER_SHARP_S
13079                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13080                                                         value)))
13081             {
13082                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13083
13084                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13085                 STRLEN foldlen;
13086
13087                 UV folded = _to_uni_fold_flags(
13088                                 value,
13089                                 foldbuf,
13090                                 &foldlen,
13091                                 FOLD_FLAGS_FULL
13092                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13093                                             : (ASCII_FOLD_RESTRICTED)
13094                                               ? FOLD_FLAGS_NOMIX_ASCII
13095                                               : 0)
13096                                 );
13097
13098                 /* Here, <folded> should be the first character of the
13099                  * multi-char fold of <value>, with <foldbuf> containing the
13100                  * whole thing.  But, if this fold is not allowed (because of
13101                  * the flags), <fold> will be the same as <value>, and should
13102                  * be processed like any other character, so skip the special
13103                  * handling */
13104                 if (folded != value) {
13105
13106                     /* Skip if we are recursed, currently parsing the class
13107                      * again.  Otherwise add this character to the list of
13108                      * multi-char folds. */
13109                     if (! RExC_in_multi_char_class) {
13110                         AV** this_array_ptr;
13111                         AV* this_array;
13112                         STRLEN cp_count = utf8_length(foldbuf,
13113                                                       foldbuf + foldlen);
13114                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13115
13116                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13117
13118
13119                         if (! multi_char_matches) {
13120                             multi_char_matches = newAV();
13121                         }
13122
13123                         /* <multi_char_matches> is actually an array of arrays.
13124                          * There will be one or two top-level elements: [2],
13125                          * and/or [3].  The [2] element is an array, each
13126                          * element thereof is a character which folds to TWO
13127                          * characters; [3] is for folds to THREE characters.
13128                          * (Unicode guarantees a maximum of 3 characters in any
13129                          * fold.)  When we rewrite the character class below,
13130                          * we will do so such that the longest folds are
13131                          * written first, so that it prefers the longest
13132                          * matching strings first.  This is done even if it
13133                          * turns out that any quantifier is non-greedy, out of
13134                          * programmer laziness.  Tom Christiansen has agreed
13135                          * that this is ok.  This makes the test for the
13136                          * ligature 'ffi' come before the test for 'ff' */
13137                         if (av_exists(multi_char_matches, cp_count)) {
13138                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13139                                                              cp_count, FALSE);
13140                             this_array = *this_array_ptr;
13141                         }
13142                         else {
13143                             this_array = newAV();
13144                             av_store(multi_char_matches, cp_count,
13145                                      (SV*) this_array);
13146                         }
13147                         av_push(this_array, multi_fold);
13148                     }
13149
13150                     /* This element should not be processed further in this
13151                      * class */
13152                     element_count--;
13153                     value = save_value;
13154                     prevvalue = save_prevvalue;
13155                     continue;
13156                 }
13157             }
13158         }
13159
13160         /* Deal with this element of the class */
13161         if (! SIZE_ONLY) {
13162 #ifndef EBCDIC
13163             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13164 #else
13165             SV* this_range = _new_invlist(1);
13166             _append_range_to_invlist(this_range, prevvalue, value);
13167
13168             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13169              * If this range was specified using something like 'i-j', we want
13170              * to include only the 'i' and the 'j', and not anything in
13171              * between, so exclude non-ASCII, non-alphabetics from it.
13172              * However, if the range was specified with something like
13173              * [\x89-\x91] or [\x89-j], all code points within it should be
13174              * included.  literal_endpoint==2 means both ends of the range used
13175              * a literal character, not \x{foo} */
13176             if (literal_endpoint == 2
13177                 && (prevvalue >= 'a' && value <= 'z')
13178                     || (prevvalue >= 'A' && value <= 'Z'))
13179             {
13180                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13181                                       &this_range);
13182             }
13183             _invlist_union(cp_list, this_range, &cp_list);
13184             literal_endpoint = 0;
13185 #endif
13186         }
13187
13188         range = 0; /* this range (if it was one) is done now */
13189     } /* End of loop through all the text within the brackets */
13190
13191     /* If anything in the class expands to more than one character, we have to
13192      * deal with them by building up a substitute parse string, and recursively
13193      * calling reg() on it, instead of proceeding */
13194     if (multi_char_matches) {
13195         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13196         I32 cp_count;
13197         STRLEN len;
13198         char *save_end = RExC_end;
13199         char *save_parse = RExC_parse;
13200         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13201                                        a "|" */
13202         I32 reg_flags;
13203
13204         assert(! invert);
13205 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13206            because too confusing */
13207         if (invert) {
13208             sv_catpv(substitute_parse, "(?:");
13209         }
13210 #endif
13211
13212         /* Look at the longest folds first */
13213         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13214
13215             if (av_exists(multi_char_matches, cp_count)) {
13216                 AV** this_array_ptr;
13217                 SV* this_sequence;
13218
13219                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13220                                                  cp_count, FALSE);
13221                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13222                                                                 &PL_sv_undef)
13223                 {
13224                     if (! first_time) {
13225                         sv_catpv(substitute_parse, "|");
13226                     }
13227                     first_time = FALSE;
13228
13229                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13230                 }
13231             }
13232         }
13233
13234         /* If the character class contains anything else besides these
13235          * multi-character folds, have to include it in recursive parsing */
13236         if (element_count) {
13237             sv_catpv(substitute_parse, "|[");
13238             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13239             sv_catpv(substitute_parse, "]");
13240         }
13241
13242         sv_catpv(substitute_parse, ")");
13243 #if 0
13244         if (invert) {
13245             /* This is a way to get the parse to skip forward a whole named
13246              * sequence instead of matching the 2nd character when it fails the
13247              * first */
13248             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13249         }
13250 #endif
13251
13252         RExC_parse = SvPV(substitute_parse, len);
13253         RExC_end = RExC_parse + len;
13254         RExC_in_multi_char_class = 1;
13255         RExC_emit = (regnode *)orig_emit;
13256
13257         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13258
13259         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13260
13261         RExC_parse = save_parse;
13262         RExC_end = save_end;
13263         RExC_in_multi_char_class = 0;
13264         SvREFCNT_dec_NN(multi_char_matches);
13265         return ret;
13266     }
13267
13268     /* If the character class contains only a single element, it may be
13269      * optimizable into another node type which is smaller and runs faster.
13270      * Check if this is the case for this class */
13271     if (element_count == 1 && ! ret_invlist) {
13272         U8 op = END;
13273         U8 arg = 0;
13274
13275         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13276                                               [:digit:] or \p{foo} */
13277
13278             /* All named classes are mapped into POSIXish nodes, with its FLAG
13279              * argument giving which class it is */
13280             switch ((I32)namedclass) {
13281                 case ANYOF_UNIPROP:
13282                     break;
13283
13284                 /* These don't depend on the charset modifiers.  They always
13285                  * match under /u rules */
13286                 case ANYOF_NHORIZWS:
13287                 case ANYOF_HORIZWS:
13288                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13289                     /* FALLTHROUGH */
13290
13291                 case ANYOF_NVERTWS:
13292                 case ANYOF_VERTWS:
13293                     op = POSIXU;
13294                     goto join_posix;
13295
13296                 /* The actual POSIXish node for all the rest depends on the
13297                  * charset modifier.  The ones in the first set depend only on
13298                  * ASCII or, if available on this platform, locale */
13299                 case ANYOF_ASCII:
13300                 case ANYOF_NASCII:
13301 #ifdef HAS_ISASCII
13302                     op = (LOC) ? POSIXL : POSIXA;
13303 #else
13304                     op = POSIXA;
13305 #endif
13306                     goto join_posix;
13307
13308                 case ANYOF_NCASED:
13309                 case ANYOF_LOWER:
13310                 case ANYOF_NLOWER:
13311                 case ANYOF_UPPER:
13312                 case ANYOF_NUPPER:
13313                     /* under /a could be alpha */
13314                     if (FOLD) {
13315                         if (ASCII_RESTRICTED) {
13316                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13317                         }
13318                         else if (! LOC) {
13319                             break;
13320                         }
13321                     }
13322                     /* FALLTHROUGH */
13323
13324                 /* The rest have more possibilities depending on the charset.
13325                  * We take advantage of the enum ordering of the charset
13326                  * modifiers to get the exact node type, */
13327                 default:
13328                     op = POSIXD + get_regex_charset(RExC_flags);
13329                     if (op > POSIXA) { /* /aa is same as /a */
13330                         op = POSIXA;
13331                     }
13332 #ifndef HAS_ISBLANK
13333                     if (op == POSIXL
13334                         && (namedclass == ANYOF_BLANK
13335                             || namedclass == ANYOF_NBLANK))
13336                     {
13337                         op = POSIXA;
13338                     }
13339 #endif
13340
13341                 join_posix:
13342                     /* The odd numbered ones are the complements of the
13343                      * next-lower even number one */
13344                     if (namedclass % 2 == 1) {
13345                         invert = ! invert;
13346                         namedclass--;
13347                     }
13348                     arg = namedclass_to_classnum(namedclass);
13349                     break;
13350             }
13351         }
13352         else if (value == prevvalue) {
13353
13354             /* Here, the class consists of just a single code point */
13355
13356             if (invert) {
13357                 if (! LOC && value == '\n') {
13358                     op = REG_ANY; /* Optimize [^\n] */
13359                     *flagp |= HASWIDTH|SIMPLE;
13360                     RExC_naughty++;
13361                 }
13362             }
13363             else if (value < 256 || UTF) {
13364
13365                 /* Optimize a single value into an EXACTish node, but not if it
13366                  * would require converting the pattern to UTF-8. */
13367                 op = compute_EXACTish(pRExC_state);
13368             }
13369         } /* Otherwise is a range */
13370         else if (! LOC) {   /* locale could vary these */
13371             if (prevvalue == '0') {
13372                 if (value == '9') {
13373                     arg = _CC_DIGIT;
13374                     op = POSIXA;
13375                 }
13376             }
13377         }
13378
13379         /* Here, we have changed <op> away from its initial value iff we found
13380          * an optimization */
13381         if (op != END) {
13382
13383             /* Throw away this ANYOF regnode, and emit the calculated one,
13384              * which should correspond to the beginning, not current, state of
13385              * the parse */
13386             const char * cur_parse = RExC_parse;
13387             RExC_parse = (char *)orig_parse;
13388             if ( SIZE_ONLY) {
13389                 if (! LOC) {
13390
13391                     /* To get locale nodes to not use the full ANYOF size would
13392                      * require moving the code above that writes the portions
13393                      * of it that aren't in other nodes to after this point.
13394                      * e.g.  ANYOF_CLASS_SET */
13395                     RExC_size = orig_size;
13396                 }
13397             }
13398             else {
13399                 RExC_emit = (regnode *)orig_emit;
13400                 if (PL_regkind[op] == POSIXD) {
13401                     if (invert) {
13402                         op += NPOSIXD - POSIXD;
13403                     }
13404                 }
13405             }
13406
13407             ret = reg_node(pRExC_state, op);
13408
13409             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13410                 if (! SIZE_ONLY) {
13411                     FLAGS(ret) = arg;
13412                 }
13413                 *flagp |= HASWIDTH|SIMPLE;
13414             }
13415             else if (PL_regkind[op] == EXACT) {
13416                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13417             }
13418
13419             RExC_parse = (char *) cur_parse;
13420
13421             SvREFCNT_dec(posixes);
13422             SvREFCNT_dec(cp_list);
13423             return ret;
13424         }
13425     }
13426
13427     if (SIZE_ONLY)
13428         return ret;
13429     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13430
13431     /* If folding, we calculate all characters that could fold to or from the
13432      * ones already on the list */
13433     if (FOLD && cp_list) {
13434         UV start, end;  /* End points of code point ranges */
13435
13436         SV* fold_intersection = NULL;
13437
13438         /* If the highest code point is within Latin1, we can use the
13439          * compiled-in Alphas list, and not have to go out to disk.  This
13440          * yields two false positives, the masculine and feminine ordinal
13441          * indicators, which are weeded out below using the
13442          * IS_IN_SOME_FOLD_L1() macro */
13443         if (invlist_highest(cp_list) < 256) {
13444             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13445                                                            &fold_intersection);
13446         }
13447         else {
13448
13449             /* Here, there are non-Latin1 code points, so we will have to go
13450              * fetch the list of all the characters that participate in folds
13451              */
13452             if (! PL_utf8_foldable) {
13453                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13454                                        &PL_sv_undef, 1, 0);
13455                 PL_utf8_foldable = _get_swash_invlist(swash);
13456                 SvREFCNT_dec_NN(swash);
13457             }
13458
13459             /* This is a hash that for a particular fold gives all characters
13460              * that are involved in it */
13461             if (! PL_utf8_foldclosures) {
13462
13463                 /* If we were unable to find any folds, then we likely won't be
13464                  * able to find the closures.  So just create an empty list.
13465                  * Folding will effectively be restricted to the non-Unicode
13466                  * rules hard-coded into Perl.  (This case happens legitimately
13467                  * during compilation of Perl itself before the Unicode tables
13468                  * are generated) */
13469                 if (_invlist_len(PL_utf8_foldable) == 0) {
13470                     PL_utf8_foldclosures = newHV();
13471                 }
13472                 else {
13473                     /* If the folds haven't been read in, call a fold function
13474                      * to force that */
13475                     if (! PL_utf8_tofold) {
13476                         U8 dummy[UTF8_MAXBYTES+1];
13477
13478                         /* This string is just a short named one above \xff */
13479                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13480                         assert(PL_utf8_tofold); /* Verify that worked */
13481                     }
13482                     PL_utf8_foldclosures =
13483                                     _swash_inversion_hash(PL_utf8_tofold);
13484                 }
13485             }
13486
13487             /* Only the characters in this class that participate in folds need
13488              * be checked.  Get the intersection of this class and all the
13489              * possible characters that are foldable.  This can quickly narrow
13490              * down a large class */
13491             _invlist_intersection(PL_utf8_foldable, cp_list,
13492                                   &fold_intersection);
13493         }
13494
13495         /* Now look at the foldable characters in this class individually */
13496         invlist_iterinit(fold_intersection);
13497         while (invlist_iternext(fold_intersection, &start, &end)) {
13498             UV j;
13499
13500             /* Locale folding for Latin1 characters is deferred until runtime */
13501             if (LOC && start < 256) {
13502                 start = 256;
13503             }
13504
13505             /* Look at every character in the range */
13506             for (j = start; j <= end; j++) {
13507
13508                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13509                 STRLEN foldlen;
13510                 SV** listp;
13511
13512                 if (j < 256) {
13513
13514                     /* We have the latin1 folding rules hard-coded here so that
13515                      * an innocent-looking character class, like /[ks]/i won't
13516                      * have to go out to disk to find the possible matches.
13517                      * XXX It would be better to generate these via regen, in
13518                      * case a new version of the Unicode standard adds new
13519                      * mappings, though that is not really likely, and may be
13520                      * caught by the default: case of the switch below. */
13521
13522                     if (IS_IN_SOME_FOLD_L1(j)) {
13523
13524                         /* ASCII is always matched; non-ASCII is matched only
13525                          * under Unicode rules */
13526                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13527                             cp_list =
13528                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13529                         }
13530                         else {
13531                             depends_list =
13532                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13533                         }
13534                     }
13535
13536                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13537                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13538                     {
13539                         /* Certain Latin1 characters have matches outside
13540                          * Latin1.  To get here, <j> is one of those
13541                          * characters.   None of these matches is valid for
13542                          * ASCII characters under /aa, which is why the 'if'
13543                          * just above excludes those.  These matches only
13544                          * happen when the target string is utf8.  The code
13545                          * below adds the single fold closures for <j> to the
13546                          * inversion list. */
13547                         switch (j) {
13548                             case 'k':
13549                             case 'K':
13550                                 cp_list =
13551                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13552                                 break;
13553                             case 's':
13554                             case 'S':
13555                                 cp_list = add_cp_to_invlist(cp_list,
13556                                                     LATIN_SMALL_LETTER_LONG_S);
13557                                 break;
13558                             case MICRO_SIGN:
13559                                 cp_list = add_cp_to_invlist(cp_list,
13560                                                     GREEK_CAPITAL_LETTER_MU);
13561                                 cp_list = add_cp_to_invlist(cp_list,
13562                                                     GREEK_SMALL_LETTER_MU);
13563                                 break;
13564                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13565                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13566                                 cp_list =
13567                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13568                                 break;
13569                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13570                                 cp_list = add_cp_to_invlist(cp_list,
13571                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13572                                 break;
13573                             case LATIN_SMALL_LETTER_SHARP_S:
13574                                 cp_list = add_cp_to_invlist(cp_list,
13575                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13576                                 break;
13577                             case 'F': case 'f':
13578                             case 'I': case 'i':
13579                             case 'L': case 'l':
13580                             case 'T': case 't':
13581                             case 'A': case 'a':
13582                             case 'H': case 'h':
13583                             case 'J': case 'j':
13584                             case 'N': case 'n':
13585                             case 'W': case 'w':
13586                             case 'Y': case 'y':
13587                                 /* These all are targets of multi-character
13588                                  * folds from code points that require UTF8 to
13589                                  * express, so they can't match unless the
13590                                  * target string is in UTF-8, so no action here
13591                                  * is necessary, as regexec.c properly handles
13592                                  * the general case for UTF-8 matching and
13593                                  * multi-char folds */
13594                                 break;
13595                             default:
13596                                 /* Use deprecated warning to increase the
13597                                  * chances of this being output */
13598                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13599                                 break;
13600                         }
13601                     }
13602                     continue;
13603                 }
13604
13605                 /* Here is an above Latin1 character.  We don't have the rules
13606                  * hard-coded for it.  First, get its fold.  This is the simple
13607                  * fold, as the multi-character folds have been handled earlier
13608                  * and separated out */
13609                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13610                                                ((LOC)
13611                                                ? FOLD_FLAGS_LOCALE
13612                                                : (ASCII_FOLD_RESTRICTED)
13613                                                   ? FOLD_FLAGS_NOMIX_ASCII
13614                                                   : 0));
13615
13616                 /* Single character fold of above Latin1.  Add everything in
13617                  * its fold closure to the list that this node should match.
13618                  * The fold closures data structure is a hash with the keys
13619                  * being the UTF-8 of every character that is folded to, like
13620                  * 'k', and the values each an array of all code points that
13621                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13622                  * Multi-character folds are not included */
13623                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13624                                       (char *) foldbuf, foldlen, FALSE)))
13625                 {
13626                     AV* list = (AV*) *listp;
13627                     IV k;
13628                     for (k = 0; k <= av_len(list); k++) {
13629                         SV** c_p = av_fetch(list, k, FALSE);
13630                         UV c;
13631                         if (c_p == NULL) {
13632                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13633                         }
13634                         c = SvUV(*c_p);
13635
13636                         /* /aa doesn't allow folds between ASCII and non-; /l
13637                          * doesn't allow them between above and below 256 */
13638                         if ((ASCII_FOLD_RESTRICTED
13639                                   && (isASCII(c) != isASCII(j)))
13640                             || (LOC && c < 256)) {
13641                             continue;
13642                         }
13643
13644                         /* Folds involving non-ascii Latin1 characters
13645                          * under /d are added to a separate list */
13646                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13647                         {
13648                             cp_list = add_cp_to_invlist(cp_list, c);
13649                         }
13650                         else {
13651                           depends_list = add_cp_to_invlist(depends_list, c);
13652                         }
13653                     }
13654                 }
13655             }
13656         }
13657         SvREFCNT_dec_NN(fold_intersection);
13658     }
13659
13660     /* And combine the result (if any) with any inversion list from posix
13661      * classes.  The lists are kept separate up to now because we don't want to
13662      * fold the classes (folding of those is automatically handled by the swash
13663      * fetching code) */
13664     if (posixes) {
13665         if (! DEPENDS_SEMANTICS) {
13666             if (cp_list) {
13667                 _invlist_union(cp_list, posixes, &cp_list);
13668                 SvREFCNT_dec_NN(posixes);
13669             }
13670             else {
13671                 cp_list = posixes;
13672             }
13673         }
13674         else {
13675             /* Under /d, we put into a separate list the Latin1 things that
13676              * match only when the target string is utf8 */
13677             SV* nonascii_but_latin1_properties = NULL;
13678             _invlist_intersection(posixes, PL_Latin1,
13679                                   &nonascii_but_latin1_properties);
13680             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13681                               &nonascii_but_latin1_properties);
13682             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13683                               &posixes);
13684             if (cp_list) {
13685                 _invlist_union(cp_list, posixes, &cp_list);
13686                 SvREFCNT_dec_NN(posixes);
13687             }
13688             else {
13689                 cp_list = posixes;
13690             }
13691
13692             if (depends_list) {
13693                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13694                                &depends_list);
13695                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13696             }
13697             else {
13698                 depends_list = nonascii_but_latin1_properties;
13699             }
13700         }
13701     }
13702
13703     /* And combine the result (if any) with any inversion list from properties.
13704      * The lists are kept separate up to now so that we can distinguish the two
13705      * in regards to matching above-Unicode.  A run-time warning is generated
13706      * if a Unicode property is matched against a non-Unicode code point. But,
13707      * we allow user-defined properties to match anything, without any warning,
13708      * and we also suppress the warning if there is a portion of the character
13709      * class that isn't a Unicode property, and which matches above Unicode, \W
13710      * or [\x{110000}] for example.
13711      * (Note that in this case, unlike the Posix one above, there is no
13712      * <depends_list>, because having a Unicode property forces Unicode
13713      * semantics */
13714     if (properties) {
13715         bool warn_super = ! has_user_defined_property;
13716         if (cp_list) {
13717
13718             /* If it matters to the final outcome, see if a non-property
13719              * component of the class matches above Unicode.  If so, the
13720              * warning gets suppressed.  This is true even if just a single
13721              * such code point is specified, as though not strictly correct if
13722              * another such code point is matched against, the fact that they
13723              * are using above-Unicode code points indicates they should know
13724              * the issues involved */
13725             if (warn_super) {
13726                 bool non_prop_matches_above_Unicode =
13727                             runtime_posix_matches_above_Unicode
13728                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13729                 if (invert) {
13730                     non_prop_matches_above_Unicode =
13731                                             !  non_prop_matches_above_Unicode;
13732                 }
13733                 warn_super = ! non_prop_matches_above_Unicode;
13734             }
13735
13736             _invlist_union(properties, cp_list, &cp_list);
13737             SvREFCNT_dec_NN(properties);
13738         }
13739         else {
13740             cp_list = properties;
13741         }
13742
13743         if (warn_super) {
13744             OP(ret) = ANYOF_WARN_SUPER;
13745         }
13746     }
13747
13748     /* Here, we have calculated what code points should be in the character
13749      * class.
13750      *
13751      * Now we can see about various optimizations.  Fold calculation (which we
13752      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13753      * would invert to include K, which under /i would match k, which it
13754      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13755      * folded until runtime */
13756
13757     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13758      * at compile time.  Besides not inverting folded locale now, we can't
13759      * invert if there are things such as \w, which aren't known until runtime
13760      * */
13761     if (invert
13762         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13763         && ! depends_list
13764         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13765     {
13766         _invlist_invert(cp_list);
13767
13768         /* Any swash can't be used as-is, because we've inverted things */
13769         if (swash) {
13770             SvREFCNT_dec_NN(swash);
13771             swash = NULL;
13772         }
13773
13774         /* Clear the invert flag since have just done it here */
13775         invert = FALSE;
13776     }
13777
13778     if (ret_invlist) {
13779         *ret_invlist = cp_list;
13780         SvREFCNT_dec(swash);
13781
13782         /* Discard the generated node */
13783         if (SIZE_ONLY) {
13784             RExC_size = orig_size;
13785         }
13786         else {
13787             RExC_emit = orig_emit;
13788         }
13789         return orig_emit;
13790     }
13791
13792     /* If we didn't do folding, it's because some information isn't available
13793      * until runtime; set the run-time fold flag for these.  (We don't have to
13794      * worry about properties folding, as that is taken care of by the swash
13795      * fetching) */
13796     if (FOLD && LOC)
13797     {
13798        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13799     }
13800
13801     /* Some character classes are equivalent to other nodes.  Such nodes take
13802      * up less room and generally fewer operations to execute than ANYOF nodes.
13803      * Above, we checked for and optimized into some such equivalents for
13804      * certain common classes that are easy to test.  Getting to this point in
13805      * the code means that the class didn't get optimized there.  Since this
13806      * code is only executed in Pass 2, it is too late to save space--it has
13807      * been allocated in Pass 1, and currently isn't given back.  But turning
13808      * things into an EXACTish node can allow the optimizer to join it to any
13809      * adjacent such nodes.  And if the class is equivalent to things like /./,
13810      * expensive run-time swashes can be avoided.  Now that we have more
13811      * complete information, we can find things necessarily missed by the
13812      * earlier code.  I (khw) am not sure how much to look for here.  It would
13813      * be easy, but perhaps too slow, to check any candidates against all the
13814      * node types they could possibly match using _invlistEQ(). */
13815
13816     if (cp_list
13817         && ! invert
13818         && ! depends_list
13819         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13820         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13821     {
13822         UV start, end;
13823         U8 op = END;  /* The optimzation node-type */
13824         const char * cur_parse= RExC_parse;
13825
13826         invlist_iterinit(cp_list);
13827         if (! invlist_iternext(cp_list, &start, &end)) {
13828
13829             /* Here, the list is empty.  This happens, for example, when a
13830              * Unicode property is the only thing in the character class, and
13831              * it doesn't match anything.  (perluniprops.pod notes such
13832              * properties) */
13833             op = OPFAIL;
13834             *flagp |= HASWIDTH|SIMPLE;
13835         }
13836         else if (start == end) {    /* The range is a single code point */
13837             if (! invlist_iternext(cp_list, &start, &end)
13838
13839                     /* Don't do this optimization if it would require changing
13840                      * the pattern to UTF-8 */
13841                 && (start < 256 || UTF))
13842             {
13843                 /* Here, the list contains a single code point.  Can optimize
13844                  * into an EXACT node */
13845
13846                 value = start;
13847
13848                 if (! FOLD) {
13849                     op = EXACT;
13850                 }
13851                 else if (LOC) {
13852
13853                     /* A locale node under folding with one code point can be
13854                      * an EXACTFL, as its fold won't be calculated until
13855                      * runtime */
13856                     op = EXACTFL;
13857                 }
13858                 else {
13859
13860                     /* Here, we are generally folding, but there is only one
13861                      * code point to match.  If we have to, we use an EXACT
13862                      * node, but it would be better for joining with adjacent
13863                      * nodes in the optimization pass if we used the same
13864                      * EXACTFish node that any such are likely to be.  We can
13865                      * do this iff the code point doesn't participate in any
13866                      * folds.  For example, an EXACTF of a colon is the same as
13867                      * an EXACT one, since nothing folds to or from a colon. */
13868                     if (value < 256) {
13869                         if (IS_IN_SOME_FOLD_L1(value)) {
13870                             op = EXACT;
13871                         }
13872                     }
13873                     else {
13874                         if (! PL_utf8_foldable) {
13875                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13876                                                 &PL_sv_undef, 1, 0);
13877                             PL_utf8_foldable = _get_swash_invlist(swash);
13878                             SvREFCNT_dec_NN(swash);
13879                         }
13880                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13881                             op = EXACT;
13882                         }
13883                     }
13884
13885                     /* If we haven't found the node type, above, it means we
13886                      * can use the prevailing one */
13887                     if (op == END) {
13888                         op = compute_EXACTish(pRExC_state);
13889                     }
13890                 }
13891             }
13892         }
13893         else if (start == 0) {
13894             if (end == UV_MAX) {
13895                 op = SANY;
13896                 *flagp |= HASWIDTH|SIMPLE;
13897                 RExC_naughty++;
13898             }
13899             else if (end == '\n' - 1
13900                     && invlist_iternext(cp_list, &start, &end)
13901                     && start == '\n' + 1 && end == UV_MAX)
13902             {
13903                 op = REG_ANY;
13904                 *flagp |= HASWIDTH|SIMPLE;
13905                 RExC_naughty++;
13906             }
13907         }
13908         invlist_iterfinish(cp_list);
13909
13910         if (op != END) {
13911             RExC_parse = (char *)orig_parse;
13912             RExC_emit = (regnode *)orig_emit;
13913
13914             ret = reg_node(pRExC_state, op);
13915
13916             RExC_parse = (char *)cur_parse;
13917
13918             if (PL_regkind[op] == EXACT) {
13919                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13920             }
13921
13922             SvREFCNT_dec_NN(cp_list);
13923             return ret;
13924         }
13925     }
13926
13927     /* Here, <cp_list> contains all the code points we can determine at
13928      * compile time that match under all conditions.  Go through it, and
13929      * for things that belong in the bitmap, put them there, and delete from
13930      * <cp_list>.  While we are at it, see if everything above 255 is in the
13931      * list, and if so, set a flag to speed up execution */
13932     ANYOF_BITMAP_ZERO(ret);
13933     if (cp_list) {
13934
13935         /* This gets set if we actually need to modify things */
13936         bool change_invlist = FALSE;
13937
13938         UV start, end;
13939
13940         /* Start looking through <cp_list> */
13941         invlist_iterinit(cp_list);
13942         while (invlist_iternext(cp_list, &start, &end)) {
13943             UV high;
13944             int i;
13945
13946             if (end == UV_MAX && start <= 256) {
13947                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13948             }
13949
13950             /* Quit if are above what we should change */
13951             if (start > 255) {
13952                 break;
13953             }
13954
13955             change_invlist = TRUE;
13956
13957             /* Set all the bits in the range, up to the max that we are doing */
13958             high = (end < 255) ? end : 255;
13959             for (i = start; i <= (int) high; i++) {
13960                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13961                     ANYOF_BITMAP_SET(ret, i);
13962                 }
13963             }
13964         }
13965         invlist_iterfinish(cp_list);
13966
13967         /* Done with loop; remove any code points that are in the bitmap from
13968          * <cp_list> */
13969         if (change_invlist) {
13970             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13971         }
13972
13973         /* If have completely emptied it, remove it completely */
13974         if (_invlist_len(cp_list) == 0) {
13975             SvREFCNT_dec_NN(cp_list);
13976             cp_list = NULL;
13977         }
13978     }
13979
13980     if (invert) {
13981         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13982     }
13983
13984     /* Here, the bitmap has been populated with all the Latin1 code points that
13985      * always match.  Can now add to the overall list those that match only
13986      * when the target string is UTF-8 (<depends_list>). */
13987     if (depends_list) {
13988         if (cp_list) {
13989             _invlist_union(cp_list, depends_list, &cp_list);
13990             SvREFCNT_dec_NN(depends_list);
13991         }
13992         else {
13993             cp_list = depends_list;
13994         }
13995     }
13996
13997     /* If there is a swash and more than one element, we can't use the swash in
13998      * the optimization below. */
13999     if (swash && element_count > 1) {
14000         SvREFCNT_dec_NN(swash);
14001         swash = NULL;
14002     }
14003
14004     if (! cp_list
14005         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14006     {
14007         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14008     }
14009     else {
14010         /* av[0] stores the character class description in its textual form:
14011          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
14012          *       appropriate swash, and is also useful for dumping the regnode.
14013          * av[1] if NULL, is a placeholder to later contain the swash computed
14014          *       from av[0].  But if no further computation need be done, the
14015          *       swash is stored there now.
14016          * av[2] stores the cp_list inversion list for use in addition or
14017          *       instead of av[0]; used only if av[1] is NULL
14018          * av[3] is set if any component of the class is from a user-defined
14019          *       property; used only if av[1] is NULL */
14020         AV * const av = newAV();
14021         SV *rv;
14022
14023         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14024                         ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14025         if (swash) {
14026             av_store(av, 1, swash);
14027             SvREFCNT_dec_NN(cp_list);
14028         }
14029         else {
14030             av_store(av, 1, NULL);
14031             if (cp_list) {
14032                 av_store(av, 2, cp_list);
14033                 av_store(av, 3, newSVuv(has_user_defined_property));
14034             }
14035         }
14036
14037         rv = newRV_noinc(MUTABLE_SV(av));
14038         n = add_data(pRExC_state, 1, "s");
14039         RExC_rxi->data->data[n] = (void*)rv;
14040         ARG_SET(ret, n);
14041     }
14042
14043     *flagp |= HASWIDTH|SIMPLE;
14044     return ret;
14045 }
14046 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14047
14048
14049 /* reg_skipcomment()
14050
14051    Absorbs an /x style # comments from the input stream.
14052    Returns true if there is more text remaining in the stream.
14053    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14054    terminates the pattern without including a newline.
14055
14056    Note its the callers responsibility to ensure that we are
14057    actually in /x mode
14058
14059 */
14060
14061 STATIC bool
14062 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14063 {
14064     bool ended = 0;
14065
14066     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14067
14068     while (RExC_parse < RExC_end)
14069         if (*RExC_parse++ == '\n') {
14070             ended = 1;
14071             break;
14072         }
14073     if (!ended) {
14074         /* we ran off the end of the pattern without ending
14075            the comment, so we have to add an \n when wrapping */
14076         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14077         return 0;
14078     } else
14079         return 1;
14080 }
14081
14082 /* nextchar()
14083
14084    Advances the parse position, and optionally absorbs
14085    "whitespace" from the inputstream.
14086
14087    Without /x "whitespace" means (?#...) style comments only,
14088    with /x this means (?#...) and # comments and whitespace proper.
14089
14090    Returns the RExC_parse point from BEFORE the scan occurs.
14091
14092    This is the /x friendly way of saying RExC_parse++.
14093 */
14094
14095 STATIC char*
14096 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14097 {
14098     char* const retval = RExC_parse++;
14099
14100     PERL_ARGS_ASSERT_NEXTCHAR;
14101
14102     for (;;) {
14103         if (RExC_end - RExC_parse >= 3
14104             && *RExC_parse == '('
14105             && RExC_parse[1] == '?'
14106             && RExC_parse[2] == '#')
14107         {
14108             while (*RExC_parse != ')') {
14109                 if (RExC_parse == RExC_end)
14110                     FAIL("Sequence (?#... not terminated");
14111                 RExC_parse++;
14112             }
14113             RExC_parse++;
14114             continue;
14115         }
14116         if (RExC_flags & RXf_PMf_EXTENDED) {
14117             if (isSPACE(*RExC_parse)) {
14118                 RExC_parse++;
14119                 continue;
14120             }
14121             else if (*RExC_parse == '#') {
14122                 if ( reg_skipcomment( pRExC_state ) )
14123                     continue;
14124             }
14125         }
14126         return retval;
14127     }
14128 }
14129
14130 /*
14131 - reg_node - emit a node
14132 */
14133 STATIC regnode *                        /* Location. */
14134 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14135 {
14136     dVAR;
14137     regnode *ptr;
14138     regnode * const ret = RExC_emit;
14139     GET_RE_DEBUG_FLAGS_DECL;
14140
14141     PERL_ARGS_ASSERT_REG_NODE;
14142
14143     if (SIZE_ONLY) {
14144         SIZE_ALIGN(RExC_size);
14145         RExC_size += 1;
14146         return(ret);
14147     }
14148     if (RExC_emit >= RExC_emit_bound)
14149         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14150                    op, RExC_emit, RExC_emit_bound);
14151
14152     NODE_ALIGN_FILL(ret);
14153     ptr = ret;
14154     FILL_ADVANCE_NODE(ptr, op);
14155 #ifdef RE_TRACK_PATTERN_OFFSETS
14156     if (RExC_offsets) {         /* MJD */
14157         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14158               "reg_node", __LINE__,
14159               PL_reg_name[op],
14160               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14161                 ? "Overwriting end of array!\n" : "OK",
14162               (UV)(RExC_emit - RExC_emit_start),
14163               (UV)(RExC_parse - RExC_start),
14164               (UV)RExC_offsets[0]));
14165         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14166     }
14167 #endif
14168     RExC_emit = ptr;
14169     return(ret);
14170 }
14171
14172 /*
14173 - reganode - emit a node with an argument
14174 */
14175 STATIC regnode *                        /* Location. */
14176 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14177 {
14178     dVAR;
14179     regnode *ptr;
14180     regnode * const ret = RExC_emit;
14181     GET_RE_DEBUG_FLAGS_DECL;
14182
14183     PERL_ARGS_ASSERT_REGANODE;
14184
14185     if (SIZE_ONLY) {
14186         SIZE_ALIGN(RExC_size);
14187         RExC_size += 2;
14188         /*
14189            We can't do this:
14190
14191            assert(2==regarglen[op]+1);
14192
14193            Anything larger than this has to allocate the extra amount.
14194            If we changed this to be:
14195
14196            RExC_size += (1 + regarglen[op]);
14197
14198            then it wouldn't matter. Its not clear what side effect
14199            might come from that so its not done so far.
14200            -- dmq
14201         */
14202         return(ret);
14203     }
14204     if (RExC_emit >= RExC_emit_bound)
14205         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14206                    op, RExC_emit, RExC_emit_bound);
14207
14208     NODE_ALIGN_FILL(ret);
14209     ptr = ret;
14210     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14211 #ifdef RE_TRACK_PATTERN_OFFSETS
14212     if (RExC_offsets) {         /* MJD */
14213         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14214               "reganode",
14215               __LINE__,
14216               PL_reg_name[op],
14217               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14218               "Overwriting end of array!\n" : "OK",
14219               (UV)(RExC_emit - RExC_emit_start),
14220               (UV)(RExC_parse - RExC_start),
14221               (UV)RExC_offsets[0]));
14222         Set_Cur_Node_Offset;
14223     }
14224 #endif
14225     RExC_emit = ptr;
14226     return(ret);
14227 }
14228
14229 /*
14230 - reguni - emit (if appropriate) a Unicode character
14231 */
14232 STATIC STRLEN
14233 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14234 {
14235     dVAR;
14236
14237     PERL_ARGS_ASSERT_REGUNI;
14238
14239     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14240 }
14241
14242 /*
14243 - reginsert - insert an operator in front of already-emitted operand
14244 *
14245 * Means relocating the operand.
14246 */
14247 STATIC void
14248 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14249 {
14250     dVAR;
14251     regnode *src;
14252     regnode *dst;
14253     regnode *place;
14254     const int offset = regarglen[(U8)op];
14255     const int size = NODE_STEP_REGNODE + offset;
14256     GET_RE_DEBUG_FLAGS_DECL;
14257
14258     PERL_ARGS_ASSERT_REGINSERT;
14259     PERL_UNUSED_ARG(depth);
14260 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14261     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14262     if (SIZE_ONLY) {
14263         RExC_size += size;
14264         return;
14265     }
14266
14267     src = RExC_emit;
14268     RExC_emit += size;
14269     dst = RExC_emit;
14270     if (RExC_open_parens) {
14271         int paren;
14272         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14273         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14274             if ( RExC_open_parens[paren] >= opnd ) {
14275                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14276                 RExC_open_parens[paren] += size;
14277             } else {
14278                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14279             }
14280             if ( RExC_close_parens[paren] >= opnd ) {
14281                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14282                 RExC_close_parens[paren] += size;
14283             } else {
14284                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14285             }
14286         }
14287     }
14288
14289     while (src > opnd) {
14290         StructCopy(--src, --dst, regnode);
14291 #ifdef RE_TRACK_PATTERN_OFFSETS
14292         if (RExC_offsets) {     /* MJD 20010112 */
14293             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14294                   "reg_insert",
14295                   __LINE__,
14296                   PL_reg_name[op],
14297                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14298                     ? "Overwriting end of array!\n" : "OK",
14299                   (UV)(src - RExC_emit_start),
14300                   (UV)(dst - RExC_emit_start),
14301                   (UV)RExC_offsets[0]));
14302             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14303             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14304         }
14305 #endif
14306     }
14307
14308
14309     place = opnd;               /* Op node, where operand used to be. */
14310 #ifdef RE_TRACK_PATTERN_OFFSETS
14311     if (RExC_offsets) {         /* MJD */
14312         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14313               "reginsert",
14314               __LINE__,
14315               PL_reg_name[op],
14316               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14317               ? "Overwriting end of array!\n" : "OK",
14318               (UV)(place - RExC_emit_start),
14319               (UV)(RExC_parse - RExC_start),
14320               (UV)RExC_offsets[0]));
14321         Set_Node_Offset(place, RExC_parse);
14322         Set_Node_Length(place, 1);
14323     }
14324 #endif
14325     src = NEXTOPER(place);
14326     FILL_ADVANCE_NODE(place, op);
14327     Zero(src, offset, regnode);
14328 }
14329
14330 /*
14331 - regtail - set the next-pointer at the end of a node chain of p to val.
14332 - SEE ALSO: regtail_study
14333 */
14334 /* TODO: All three parms should be const */
14335 STATIC void
14336 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14337 {
14338     dVAR;
14339     regnode *scan;
14340     GET_RE_DEBUG_FLAGS_DECL;
14341
14342     PERL_ARGS_ASSERT_REGTAIL;
14343 #ifndef DEBUGGING
14344     PERL_UNUSED_ARG(depth);
14345 #endif
14346
14347     if (SIZE_ONLY)
14348         return;
14349
14350     /* Find last node. */
14351     scan = p;
14352     for (;;) {
14353         regnode * const temp = regnext(scan);
14354         DEBUG_PARSE_r({
14355             SV * const mysv=sv_newmortal();
14356             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14357             regprop(RExC_rx, mysv, scan);
14358             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14359                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14360                     (temp == NULL ? "->" : ""),
14361                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14362             );
14363         });
14364         if (temp == NULL)
14365             break;
14366         scan = temp;
14367     }
14368
14369     if (reg_off_by_arg[OP(scan)]) {
14370         ARG_SET(scan, val - scan);
14371     }
14372     else {
14373         NEXT_OFF(scan) = val - scan;
14374     }
14375 }
14376
14377 #ifdef DEBUGGING
14378 /*
14379 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14380 - Look for optimizable sequences at the same time.
14381 - currently only looks for EXACT chains.
14382
14383 This is experimental code. The idea is to use this routine to perform
14384 in place optimizations on branches and groups as they are constructed,
14385 with the long term intention of removing optimization from study_chunk so
14386 that it is purely analytical.
14387
14388 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14389 to control which is which.
14390
14391 */
14392 /* TODO: All four parms should be const */
14393
14394 STATIC U8
14395 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14396 {
14397     dVAR;
14398     regnode *scan;
14399     U8 exact = PSEUDO;
14400 #ifdef EXPERIMENTAL_INPLACESCAN
14401     I32 min = 0;
14402 #endif
14403     GET_RE_DEBUG_FLAGS_DECL;
14404
14405     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14406
14407
14408     if (SIZE_ONLY)
14409         return exact;
14410
14411     /* Find last node. */
14412
14413     scan = p;
14414     for (;;) {
14415         regnode * const temp = regnext(scan);
14416 #ifdef EXPERIMENTAL_INPLACESCAN
14417         if (PL_regkind[OP(scan)] == EXACT) {
14418             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14419             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14420                 return EXACT;
14421         }
14422 #endif
14423         if ( exact ) {
14424             switch (OP(scan)) {
14425                 case EXACT:
14426                 case EXACTF:
14427                 case EXACTFA:
14428                 case EXACTFU:
14429                 case EXACTFU_SS:
14430                 case EXACTFU_TRICKYFOLD:
14431                 case EXACTFL:
14432                         if( exact == PSEUDO )
14433                             exact= OP(scan);
14434                         else if ( exact != OP(scan) )
14435                             exact= 0;
14436                 case NOTHING:
14437                     break;
14438                 default:
14439                     exact= 0;
14440             }
14441         }
14442         DEBUG_PARSE_r({
14443             SV * const mysv=sv_newmortal();
14444             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14445             regprop(RExC_rx, mysv, scan);
14446             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14447                 SvPV_nolen_const(mysv),
14448                 REG_NODE_NUM(scan),
14449                 PL_reg_name[exact]);
14450         });
14451         if (temp == NULL)
14452             break;
14453         scan = temp;
14454     }
14455     DEBUG_PARSE_r({
14456         SV * const mysv_val=sv_newmortal();
14457         DEBUG_PARSE_MSG("");
14458         regprop(RExC_rx, mysv_val, val);
14459         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14460                       SvPV_nolen_const(mysv_val),
14461                       (IV)REG_NODE_NUM(val),
14462                       (IV)(val - scan)
14463         );
14464     });
14465     if (reg_off_by_arg[OP(scan)]) {
14466         ARG_SET(scan, val - scan);
14467     }
14468     else {
14469         NEXT_OFF(scan) = val - scan;
14470     }
14471
14472     return exact;
14473 }
14474 #endif
14475
14476 /*
14477  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14478  */
14479 #ifdef DEBUGGING
14480
14481 static void
14482 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14483 {
14484     int bit;
14485     int set=0;
14486
14487     for (bit=0; bit<32; bit++) {
14488         if (flags & (1<<bit)) {
14489             if (!set++ && lead)
14490                 PerlIO_printf(Perl_debug_log, "%s",lead);
14491             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14492         }
14493     }
14494     if (lead)  {
14495         if (set)
14496             PerlIO_printf(Perl_debug_log, "\n");
14497         else
14498             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14499     }
14500 }
14501
14502 static void
14503 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14504 {
14505     int bit;
14506     int set=0;
14507     regex_charset cs;
14508
14509     for (bit=0; bit<32; bit++) {
14510         if (flags & (1<<bit)) {
14511             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14512                 continue;
14513             }
14514             if (!set++ && lead)
14515                 PerlIO_printf(Perl_debug_log, "%s",lead);
14516             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14517         }
14518     }
14519     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14520             if (!set++ && lead) {
14521                 PerlIO_printf(Perl_debug_log, "%s",lead);
14522             }
14523             switch (cs) {
14524                 case REGEX_UNICODE_CHARSET:
14525                     PerlIO_printf(Perl_debug_log, "UNICODE");
14526                     break;
14527                 case REGEX_LOCALE_CHARSET:
14528                     PerlIO_printf(Perl_debug_log, "LOCALE");
14529                     break;
14530                 case REGEX_ASCII_RESTRICTED_CHARSET:
14531                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14532                     break;
14533                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14534                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14535                     break;
14536                 default:
14537                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14538                     break;
14539             }
14540     }
14541     if (lead)  {
14542         if (set)
14543             PerlIO_printf(Perl_debug_log, "\n");
14544         else
14545             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14546     }
14547 }
14548 #endif
14549
14550 void
14551 Perl_regdump(pTHX_ const regexp *r)
14552 {
14553 #ifdef DEBUGGING
14554     dVAR;
14555     SV * const sv = sv_newmortal();
14556     SV *dsv= sv_newmortal();
14557     RXi_GET_DECL(r,ri);
14558     GET_RE_DEBUG_FLAGS_DECL;
14559
14560     PERL_ARGS_ASSERT_REGDUMP;
14561
14562     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14563
14564     /* Header fields of interest. */
14565     if (r->anchored_substr) {
14566         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14567             RE_SV_DUMPLEN(r->anchored_substr), 30);
14568         PerlIO_printf(Perl_debug_log,
14569                       "anchored %s%s at %"IVdf" ",
14570                       s, RE_SV_TAIL(r->anchored_substr),
14571                       (IV)r->anchored_offset);
14572     } else if (r->anchored_utf8) {
14573         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14574             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14575         PerlIO_printf(Perl_debug_log,
14576                       "anchored utf8 %s%s at %"IVdf" ",
14577                       s, RE_SV_TAIL(r->anchored_utf8),
14578                       (IV)r->anchored_offset);
14579     }
14580     if (r->float_substr) {
14581         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14582             RE_SV_DUMPLEN(r->float_substr), 30);
14583         PerlIO_printf(Perl_debug_log,
14584                       "floating %s%s at %"IVdf"..%"UVuf" ",
14585                       s, RE_SV_TAIL(r->float_substr),
14586                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14587     } else if (r->float_utf8) {
14588         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14589             RE_SV_DUMPLEN(r->float_utf8), 30);
14590         PerlIO_printf(Perl_debug_log,
14591                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14592                       s, RE_SV_TAIL(r->float_utf8),
14593                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14594     }
14595     if (r->check_substr || r->check_utf8)
14596         PerlIO_printf(Perl_debug_log,
14597                       (const char *)
14598                       (r->check_substr == r->float_substr
14599                        && r->check_utf8 == r->float_utf8
14600                        ? "(checking floating" : "(checking anchored"));
14601     if (r->extflags & RXf_NOSCAN)
14602         PerlIO_printf(Perl_debug_log, " noscan");
14603     if (r->extflags & RXf_CHECK_ALL)
14604         PerlIO_printf(Perl_debug_log, " isall");
14605     if (r->check_substr || r->check_utf8)
14606         PerlIO_printf(Perl_debug_log, ") ");
14607
14608     if (ri->regstclass) {
14609         regprop(r, sv, ri->regstclass);
14610         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14611     }
14612     if (r->extflags & RXf_ANCH) {
14613         PerlIO_printf(Perl_debug_log, "anchored");
14614         if (r->extflags & RXf_ANCH_BOL)
14615             PerlIO_printf(Perl_debug_log, "(BOL)");
14616         if (r->extflags & RXf_ANCH_MBOL)
14617             PerlIO_printf(Perl_debug_log, "(MBOL)");
14618         if (r->extflags & RXf_ANCH_SBOL)
14619             PerlIO_printf(Perl_debug_log, "(SBOL)");
14620         if (r->extflags & RXf_ANCH_GPOS)
14621             PerlIO_printf(Perl_debug_log, "(GPOS)");
14622         PerlIO_putc(Perl_debug_log, ' ');
14623     }
14624     if (r->extflags & RXf_GPOS_SEEN)
14625         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14626     if (r->intflags & PREGf_SKIP)
14627         PerlIO_printf(Perl_debug_log, "plus ");
14628     if (r->intflags & PREGf_IMPLICIT)
14629         PerlIO_printf(Perl_debug_log, "implicit ");
14630     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14631     if (r->extflags & RXf_EVAL_SEEN)
14632         PerlIO_printf(Perl_debug_log, "with eval ");
14633     PerlIO_printf(Perl_debug_log, "\n");
14634     DEBUG_FLAGS_r({
14635         regdump_extflags("r->extflags: ",r->extflags);
14636         regdump_intflags("r->intflags: ",r->intflags);
14637     });
14638 #else
14639     PERL_ARGS_ASSERT_REGDUMP;
14640     PERL_UNUSED_CONTEXT;
14641     PERL_UNUSED_ARG(r);
14642 #endif  /* DEBUGGING */
14643 }
14644
14645 /*
14646 - regprop - printable representation of opcode
14647 */
14648 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14649 STMT_START { \
14650         if (do_sep) {                           \
14651             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14652             if (flags & ANYOF_INVERT)           \
14653                 /*make sure the invert info is in each */ \
14654                 sv_catpvs(sv, "^");             \
14655             do_sep = 0;                         \
14656         }                                       \
14657 } STMT_END
14658
14659 void
14660 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14661 {
14662 #ifdef DEBUGGING
14663     dVAR;
14664     int k;
14665
14666     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14667     static const char * const anyofs[] = {
14668 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14669     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14670     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14671     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14672     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14673     || _CC_VERTSPACE != 16
14674   #error Need to adjust order of anyofs[]
14675 #endif
14676         "[\\w]",
14677         "[\\W]",
14678         "[\\d]",
14679         "[\\D]",
14680         "[:alpha:]",
14681         "[:^alpha:]",
14682         "[:lower:]",
14683         "[:^lower:]",
14684         "[:upper:]",
14685         "[:^upper:]",
14686         "[:punct:]",
14687         "[:^punct:]",
14688         "[:print:]",
14689         "[:^print:]",
14690         "[:alnum:]",
14691         "[:^alnum:]",
14692         "[:graph:]",
14693         "[:^graph:]",
14694         "[:cased:]",
14695         "[:^cased:]",
14696         "[\\s]",
14697         "[\\S]",
14698         "[:blank:]",
14699         "[:^blank:]",
14700         "[:xdigit:]",
14701         "[:^xdigit:]",
14702         "[:space:]",
14703         "[:^space:]",
14704         "[:cntrl:]",
14705         "[:^cntrl:]",
14706         "[:ascii:]",
14707         "[:^ascii:]",
14708         "[\\v]",
14709         "[\\V]"
14710     };
14711     RXi_GET_DECL(prog,progi);
14712     GET_RE_DEBUG_FLAGS_DECL;
14713
14714     PERL_ARGS_ASSERT_REGPROP;
14715
14716     sv_setpvs(sv, "");
14717
14718     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14719         /* It would be nice to FAIL() here, but this may be called from
14720            regexec.c, and it would be hard to supply pRExC_state. */
14721         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14722     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14723
14724     k = PL_regkind[OP(o)];
14725
14726     if (k == EXACT) {
14727         sv_catpvs(sv, " ");
14728         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14729          * is a crude hack but it may be the best for now since
14730          * we have no flag "this EXACTish node was UTF-8"
14731          * --jhi */
14732         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14733                   PERL_PV_ESCAPE_UNI_DETECT |
14734                   PERL_PV_ESCAPE_NONASCII   |
14735                   PERL_PV_PRETTY_ELLIPSES   |
14736                   PERL_PV_PRETTY_LTGT       |
14737                   PERL_PV_PRETTY_NOCLEAR
14738                   );
14739     } else if (k == TRIE) {
14740         /* print the details of the trie in dumpuntil instead, as
14741          * progi->data isn't available here */
14742         const char op = OP(o);
14743         const U32 n = ARG(o);
14744         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14745                (reg_ac_data *)progi->data->data[n] :
14746                NULL;
14747         const reg_trie_data * const trie
14748             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14749
14750         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14751         DEBUG_TRIE_COMPILE_r(
14752             Perl_sv_catpvf(aTHX_ sv,
14753                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14754                 (UV)trie->startstate,
14755                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14756                 (UV)trie->wordcount,
14757                 (UV)trie->minlen,
14758                 (UV)trie->maxlen,
14759                 (UV)TRIE_CHARCOUNT(trie),
14760                 (UV)trie->uniquecharcount
14761             )
14762         );
14763         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14764             sv_catpvs(sv, "[");
14765             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14766                                                    ? ANYOF_BITMAP(o)
14767                                                    : TRIE_BITMAP(trie));
14768             sv_catpvs(sv, "]");
14769         }
14770
14771     } else if (k == CURLY) {
14772         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14773             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14774         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14775     }
14776     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14777         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14778     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14779         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14780         if ( RXp_PAREN_NAMES(prog) ) {
14781             if ( k != REF || (OP(o) < NREF)) {
14782                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14783                 SV **name= av_fetch(list, ARG(o), 0 );
14784                 if (name)
14785                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14786             }
14787             else {
14788                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14789                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14790                 I32 *nums=(I32*)SvPVX(sv_dat);
14791                 SV **name= av_fetch(list, nums[0], 0 );
14792                 I32 n;
14793                 if (name) {
14794                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14795                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14796                                     (n ? "," : ""), (IV)nums[n]);
14797                     }
14798                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14799                 }
14800             }
14801         }
14802     } else if (k == GOSUB)
14803         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14804     else if (k == VERB) {
14805         if (!o->flags)
14806             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14807                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14808     } else if (k == LOGICAL)
14809         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14810     else if (k == ANYOF) {
14811         const U8 flags = ANYOF_FLAGS(o);
14812         int do_sep = 0;
14813
14814
14815         if (flags & ANYOF_LOCALE)
14816             sv_catpvs(sv, "{loc}");
14817         if (flags & ANYOF_LOC_FOLD)
14818             sv_catpvs(sv, "{i}");
14819         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14820         if (flags & ANYOF_INVERT)
14821             sv_catpvs(sv, "^");
14822
14823         /* output what the standard cp 0-255 bitmap matches */
14824         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14825
14826         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14827         /* output any special charclass tests (used entirely under use locale) */
14828         if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14829             int i;
14830             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14831                 if (ANYOF_CLASS_TEST(o,i)) {
14832                     sv_catpv(sv, anyofs[i]);
14833                     do_sep = 1;
14834                 }
14835             }
14836         }
14837
14838         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14839
14840         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14841             sv_catpvs(sv, "{non-utf8-latin1-all}");
14842         }
14843
14844         /* output information about the unicode matching */
14845         if (flags & ANYOF_UNICODE_ALL)
14846             sv_catpvs(sv, "{unicode_all}");
14847         else if (ANYOF_NONBITMAP(o)) {
14848             SV *lv; /* Set if there is something outside the bit map. */
14849             SV * sw;
14850             bool byte_output = FALSE;   /* If something in the bitmap has been
14851                                            output */
14852
14853             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14854                 sv_catpvs(sv, "{outside bitmap}");
14855             }
14856             else {
14857                 sv_catpvs(sv, "{utf8}");
14858             }
14859
14860             /* Get the stuff that wasn't in the bitmap */
14861             sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14862             if (lv && lv != &PL_sv_undef) {
14863                 char *s = savesvpv(lv);
14864                 char * const origs = s;
14865
14866                 while (*s && *s != '\n')
14867                     s++;
14868
14869                 if (*s == '\n') {
14870                     const char * const t = ++s;
14871
14872                     if (byte_output) {
14873                         sv_catpvs(sv, " ");
14874                     }
14875
14876                     while (*s) {
14877                         if (*s == '\n') {
14878
14879                             /* Truncate very long output */
14880                             if (s - origs > 256) {
14881                                 Perl_sv_catpvf(aTHX_ sv,
14882                                                "%.*s...",
14883                                                (int) (s - origs - 1),
14884                                                t);
14885                                 goto out_dump;
14886                             }
14887                             *s = ' ';
14888                         }
14889                         else if (*s == '\t') {
14890                             *s = '-';
14891                         }
14892                         s++;
14893                     }
14894                     if (s[-1] == ' ')
14895                         s[-1] = 0;
14896
14897                     sv_catpv(sv, t);
14898                 }
14899
14900             out_dump:
14901
14902                 Safefree(origs);
14903                 SvREFCNT_dec_NN(lv);
14904             }
14905         }
14906
14907         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14908     }
14909     else if (k == POSIXD || k == NPOSIXD) {
14910         U8 index = FLAGS(o) * 2;
14911         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14912             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14913         }
14914         else {
14915             sv_catpv(sv, anyofs[index]);
14916         }
14917     }
14918     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14919         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14920 #else
14921     PERL_UNUSED_CONTEXT;
14922     PERL_UNUSED_ARG(sv);
14923     PERL_UNUSED_ARG(o);
14924     PERL_UNUSED_ARG(prog);
14925 #endif  /* DEBUGGING */
14926 }
14927
14928 SV *
14929 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14930 {                               /* Assume that RE_INTUIT is set */
14931     dVAR;
14932     struct regexp *const prog = ReANY(r);
14933     GET_RE_DEBUG_FLAGS_DECL;
14934
14935     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14936     PERL_UNUSED_CONTEXT;
14937
14938     DEBUG_COMPILE_r(
14939         {
14940             const char * const s = SvPV_nolen_const(prog->check_substr
14941                       ? prog->check_substr : prog->check_utf8);
14942
14943             if (!PL_colorset) reginitcolors();
14944             PerlIO_printf(Perl_debug_log,
14945                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14946                       PL_colors[4],
14947                       prog->check_substr ? "" : "utf8 ",
14948                       PL_colors[5],PL_colors[0],
14949                       s,
14950                       PL_colors[1],
14951                       (strlen(s) > 60 ? "..." : ""));
14952         } );
14953
14954     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14955 }
14956
14957 /*
14958    pregfree()
14959
14960    handles refcounting and freeing the perl core regexp structure. When
14961    it is necessary to actually free the structure the first thing it
14962    does is call the 'free' method of the regexp_engine associated to
14963    the regexp, allowing the handling of the void *pprivate; member
14964    first. (This routine is not overridable by extensions, which is why
14965    the extensions free is called first.)
14966
14967    See regdupe and regdupe_internal if you change anything here.
14968 */
14969 #ifndef PERL_IN_XSUB_RE
14970 void
14971 Perl_pregfree(pTHX_ REGEXP *r)
14972 {
14973     SvREFCNT_dec(r);
14974 }
14975
14976 void
14977 Perl_pregfree2(pTHX_ REGEXP *rx)
14978 {
14979     dVAR;
14980     struct regexp *const r = ReANY(rx);
14981     GET_RE_DEBUG_FLAGS_DECL;
14982
14983     PERL_ARGS_ASSERT_PREGFREE2;
14984
14985     if (r->mother_re) {
14986         ReREFCNT_dec(r->mother_re);
14987     } else {
14988         CALLREGFREE_PVT(rx); /* free the private data */
14989         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14990         Safefree(r->xpv_len_u.xpvlenu_pv);
14991     }
14992     if (r->substrs) {
14993         SvREFCNT_dec(r->anchored_substr);
14994         SvREFCNT_dec(r->anchored_utf8);
14995         SvREFCNT_dec(r->float_substr);
14996         SvREFCNT_dec(r->float_utf8);
14997         Safefree(r->substrs);
14998     }
14999     RX_MATCH_COPY_FREE(rx);
15000 #ifdef PERL_ANY_COW
15001     SvREFCNT_dec(r->saved_copy);
15002 #endif
15003     Safefree(r->offs);
15004     SvREFCNT_dec(r->qr_anoncv);
15005     rx->sv_u.svu_rx = 0;
15006 }
15007
15008 /*  reg_temp_copy()
15009
15010     This is a hacky workaround to the structural issue of match results
15011     being stored in the regexp structure which is in turn stored in
15012     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15013     could be PL_curpm in multiple contexts, and could require multiple
15014     result sets being associated with the pattern simultaneously, such
15015     as when doing a recursive match with (??{$qr})
15016
15017     The solution is to make a lightweight copy of the regexp structure
15018     when a qr// is returned from the code executed by (??{$qr}) this
15019     lightweight copy doesn't actually own any of its data except for
15020     the starp/end and the actual regexp structure itself.
15021
15022 */
15023
15024
15025 REGEXP *
15026 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15027 {
15028     struct regexp *ret;
15029     struct regexp *const r = ReANY(rx);
15030     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15031
15032     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15033
15034     if (!ret_x)
15035         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15036     else {
15037         SvOK_off((SV *)ret_x);
15038         if (islv) {
15039             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15040                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15041                made both spots point to the same regexp body.) */
15042             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15043             assert(!SvPVX(ret_x));
15044             ret_x->sv_u.svu_rx = temp->sv_any;
15045             temp->sv_any = NULL;
15046             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15047             SvREFCNT_dec_NN(temp);
15048             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15049                ing below will not set it. */
15050             SvCUR_set(ret_x, SvCUR(rx));
15051         }
15052     }
15053     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15054        sv_force_normal(sv) is called.  */
15055     SvFAKE_on(ret_x);
15056     ret = ReANY(ret_x);
15057
15058     SvFLAGS(ret_x) |= SvUTF8(rx);
15059     /* We share the same string buffer as the original regexp, on which we
15060        hold a reference count, incremented when mother_re is set below.
15061        The string pointer is copied here, being part of the regexp struct.
15062      */
15063     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15064            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15065     if (r->offs) {
15066         const I32 npar = r->nparens+1;
15067         Newx(ret->offs, npar, regexp_paren_pair);
15068         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15069     }
15070     if (r->substrs) {
15071         Newx(ret->substrs, 1, struct reg_substr_data);
15072         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15073
15074         SvREFCNT_inc_void(ret->anchored_substr);
15075         SvREFCNT_inc_void(ret->anchored_utf8);
15076         SvREFCNT_inc_void(ret->float_substr);
15077         SvREFCNT_inc_void(ret->float_utf8);
15078
15079         /* check_substr and check_utf8, if non-NULL, point to either their
15080            anchored or float namesakes, and don't hold a second reference.  */
15081     }
15082     RX_MATCH_COPIED_off(ret_x);
15083 #ifdef PERL_ANY_COW
15084     ret->saved_copy = NULL;
15085 #endif
15086     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15087     SvREFCNT_inc_void(ret->qr_anoncv);
15088
15089     return ret_x;
15090 }
15091 #endif
15092
15093 /* regfree_internal()
15094
15095    Free the private data in a regexp. This is overloadable by
15096    extensions. Perl takes care of the regexp structure in pregfree(),
15097    this covers the *pprivate pointer which technically perl doesn't
15098    know about, however of course we have to handle the
15099    regexp_internal structure when no extension is in use.
15100
15101    Note this is called before freeing anything in the regexp
15102    structure.
15103  */
15104
15105 void
15106 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15107 {
15108     dVAR;
15109     struct regexp *const r = ReANY(rx);
15110     RXi_GET_DECL(r,ri);
15111     GET_RE_DEBUG_FLAGS_DECL;
15112
15113     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15114
15115     DEBUG_COMPILE_r({
15116         if (!PL_colorset)
15117             reginitcolors();
15118         {
15119             SV *dsv= sv_newmortal();
15120             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15121                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15122             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15123                 PL_colors[4],PL_colors[5],s);
15124         }
15125     });
15126 #ifdef RE_TRACK_PATTERN_OFFSETS
15127     if (ri->u.offsets)
15128         Safefree(ri->u.offsets);             /* 20010421 MJD */
15129 #endif
15130     if (ri->code_blocks) {
15131         int n;
15132         for (n = 0; n < ri->num_code_blocks; n++)
15133             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15134         Safefree(ri->code_blocks);
15135     }
15136
15137     if (ri->data) {
15138         int n = ri->data->count;
15139
15140         while (--n >= 0) {
15141           /* If you add a ->what type here, update the comment in regcomp.h */
15142             switch (ri->data->what[n]) {
15143             case 'a':
15144             case 'r':
15145             case 's':
15146             case 'S':
15147             case 'u':
15148                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15149                 break;
15150             case 'f':
15151                 Safefree(ri->data->data[n]);
15152                 break;
15153             case 'l':
15154             case 'L':
15155                 break;
15156             case 'T':
15157                 { /* Aho Corasick add-on structure for a trie node.
15158                      Used in stclass optimization only */
15159                     U32 refcount;
15160                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15161                     OP_REFCNT_LOCK;
15162                     refcount = --aho->refcount;
15163                     OP_REFCNT_UNLOCK;
15164                     if ( !refcount ) {
15165                         PerlMemShared_free(aho->states);
15166                         PerlMemShared_free(aho->fail);
15167                          /* do this last!!!! */
15168                         PerlMemShared_free(ri->data->data[n]);
15169                         PerlMemShared_free(ri->regstclass);
15170                     }
15171                 }
15172                 break;
15173             case 't':
15174                 {
15175                     /* trie structure. */
15176                     U32 refcount;
15177                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15178                     OP_REFCNT_LOCK;
15179                     refcount = --trie->refcount;
15180                     OP_REFCNT_UNLOCK;
15181                     if ( !refcount ) {
15182                         PerlMemShared_free(trie->charmap);
15183                         PerlMemShared_free(trie->states);
15184                         PerlMemShared_free(trie->trans);
15185                         if (trie->bitmap)
15186                             PerlMemShared_free(trie->bitmap);
15187                         if (trie->jump)
15188                             PerlMemShared_free(trie->jump);
15189                         PerlMemShared_free(trie->wordinfo);
15190                         /* do this last!!!! */
15191                         PerlMemShared_free(ri->data->data[n]);
15192                     }
15193                 }
15194                 break;
15195             default:
15196                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15197             }
15198         }
15199         Safefree(ri->data->what);
15200         Safefree(ri->data);
15201     }
15202
15203     Safefree(ri);
15204 }
15205
15206 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15207 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15208 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15209
15210 /*
15211    re_dup - duplicate a regexp.
15212
15213    This routine is expected to clone a given regexp structure. It is only
15214    compiled under USE_ITHREADS.
15215
15216    After all of the core data stored in struct regexp is duplicated
15217    the regexp_engine.dupe method is used to copy any private data
15218    stored in the *pprivate pointer. This allows extensions to handle
15219    any duplication it needs to do.
15220
15221    See pregfree() and regfree_internal() if you change anything here.
15222 */
15223 #if defined(USE_ITHREADS)
15224 #ifndef PERL_IN_XSUB_RE
15225 void
15226 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15227 {
15228     dVAR;
15229     I32 npar;
15230     const struct regexp *r = ReANY(sstr);
15231     struct regexp *ret = ReANY(dstr);
15232
15233     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15234
15235     npar = r->nparens+1;
15236     Newx(ret->offs, npar, regexp_paren_pair);
15237     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15238
15239     if (ret->substrs) {
15240         /* Do it this way to avoid reading from *r after the StructCopy().
15241            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15242            cache, it doesn't matter.  */
15243         const bool anchored = r->check_substr
15244             ? r->check_substr == r->anchored_substr
15245             : r->check_utf8 == r->anchored_utf8;
15246         Newx(ret->substrs, 1, struct reg_substr_data);
15247         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15248
15249         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15250         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15251         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15252         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15253
15254         /* check_substr and check_utf8, if non-NULL, point to either their
15255            anchored or float namesakes, and don't hold a second reference.  */
15256
15257         if (ret->check_substr) {
15258             if (anchored) {
15259                 assert(r->check_utf8 == r->anchored_utf8);
15260                 ret->check_substr = ret->anchored_substr;
15261                 ret->check_utf8 = ret->anchored_utf8;
15262             } else {
15263                 assert(r->check_substr == r->float_substr);
15264                 assert(r->check_utf8 == r->float_utf8);
15265                 ret->check_substr = ret->float_substr;
15266                 ret->check_utf8 = ret->float_utf8;
15267             }
15268         } else if (ret->check_utf8) {
15269             if (anchored) {
15270                 ret->check_utf8 = ret->anchored_utf8;
15271             } else {
15272                 ret->check_utf8 = ret->float_utf8;
15273             }
15274         }
15275     }
15276
15277     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15278     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15279
15280     if (ret->pprivate)
15281         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15282
15283     if (RX_MATCH_COPIED(dstr))
15284         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15285     else
15286         ret->subbeg = NULL;
15287 #ifdef PERL_ANY_COW
15288     ret->saved_copy = NULL;
15289 #endif
15290
15291     /* Whether mother_re be set or no, we need to copy the string.  We
15292        cannot refrain from copying it when the storage points directly to
15293        our mother regexp, because that's
15294                1: a buffer in a different thread
15295                2: something we no longer hold a reference on
15296                so we need to copy it locally.  */
15297     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15298     ret->mother_re   = NULL;
15299 }
15300 #endif /* PERL_IN_XSUB_RE */
15301
15302 /*
15303    regdupe_internal()
15304
15305    This is the internal complement to regdupe() which is used to copy
15306    the structure pointed to by the *pprivate pointer in the regexp.
15307    This is the core version of the extension overridable cloning hook.
15308    The regexp structure being duplicated will be copied by perl prior
15309    to this and will be provided as the regexp *r argument, however
15310    with the /old/ structures pprivate pointer value. Thus this routine
15311    may override any copying normally done by perl.
15312
15313    It returns a pointer to the new regexp_internal structure.
15314 */
15315
15316 void *
15317 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15318 {
15319     dVAR;
15320     struct regexp *const r = ReANY(rx);
15321     regexp_internal *reti;
15322     int len;
15323     RXi_GET_DECL(r,ri);
15324
15325     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15326
15327     len = ProgLen(ri);
15328
15329     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15330     Copy(ri->program, reti->program, len+1, regnode);
15331
15332     reti->num_code_blocks = ri->num_code_blocks;
15333     if (ri->code_blocks) {
15334         int n;
15335         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15336                 struct reg_code_block);
15337         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15338                 struct reg_code_block);
15339         for (n = 0; n < ri->num_code_blocks; n++)
15340              reti->code_blocks[n].src_regex = (REGEXP*)
15341                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15342     }
15343     else
15344         reti->code_blocks = NULL;
15345
15346     reti->regstclass = NULL;
15347
15348     if (ri->data) {
15349         struct reg_data *d;
15350         const int count = ri->data->count;
15351         int i;
15352
15353         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15354                 char, struct reg_data);
15355         Newx(d->what, count, U8);
15356
15357         d->count = count;
15358         for (i = 0; i < count; i++) {
15359             d->what[i] = ri->data->what[i];
15360             switch (d->what[i]) {
15361                 /* see also regcomp.h and regfree_internal() */
15362             case 'a': /* actually an AV, but the dup function is identical.  */
15363             case 'r':
15364             case 's':
15365             case 'S':
15366             case 'u': /* actually an HV, but the dup function is identical.  */
15367                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15368                 break;
15369             case 'f':
15370                 /* This is cheating. */
15371                 Newx(d->data[i], 1, struct regnode_charclass_class);
15372                 StructCopy(ri->data->data[i], d->data[i],
15373                             struct regnode_charclass_class);
15374                 reti->regstclass = (regnode*)d->data[i];
15375                 break;
15376             case 'T':
15377                 /* Trie stclasses are readonly and can thus be shared
15378                  * without duplication. We free the stclass in pregfree
15379                  * when the corresponding reg_ac_data struct is freed.
15380                  */
15381                 reti->regstclass= ri->regstclass;
15382                 /* Fall through */
15383             case 't':
15384                 OP_REFCNT_LOCK;
15385                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15386                 OP_REFCNT_UNLOCK;
15387                 /* Fall through */
15388             case 'l':
15389             case 'L':
15390                 d->data[i] = ri->data->data[i];
15391                 break;
15392             default:
15393                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15394             }
15395         }
15396
15397         reti->data = d;
15398     }
15399     else
15400         reti->data = NULL;
15401
15402     reti->name_list_idx = ri->name_list_idx;
15403
15404 #ifdef RE_TRACK_PATTERN_OFFSETS
15405     if (ri->u.offsets) {
15406         Newx(reti->u.offsets, 2*len+1, U32);
15407         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15408     }
15409 #else
15410     SetProgLen(reti,len);
15411 #endif
15412
15413     return (void*)reti;
15414 }
15415
15416 #endif    /* USE_ITHREADS */
15417
15418 #ifndef PERL_IN_XSUB_RE
15419
15420 /*
15421  - regnext - dig the "next" pointer out of a node
15422  */
15423 regnode *
15424 Perl_regnext(pTHX_ regnode *p)
15425 {
15426     dVAR;
15427     I32 offset;
15428
15429     if (!p)
15430         return(NULL);
15431
15432     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15433         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15434     }
15435
15436     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15437     if (offset == 0)
15438         return(NULL);
15439
15440     return(p+offset);
15441 }
15442 #endif
15443
15444 STATIC void
15445 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15446 {
15447     va_list args;
15448     STRLEN l1 = strlen(pat1);
15449     STRLEN l2 = strlen(pat2);
15450     char buf[512];
15451     SV *msv;
15452     const char *message;
15453
15454     PERL_ARGS_ASSERT_RE_CROAK2;
15455
15456     if (l1 > 510)
15457         l1 = 510;
15458     if (l1 + l2 > 510)
15459         l2 = 510 - l1;
15460     Copy(pat1, buf, l1 , char);
15461     Copy(pat2, buf + l1, l2 , char);
15462     buf[l1 + l2] = '\n';
15463     buf[l1 + l2 + 1] = '\0';
15464 #ifdef I_STDARG
15465     /* ANSI variant takes additional second argument */
15466     va_start(args, pat2);
15467 #else
15468     va_start(args);
15469 #endif
15470     msv = vmess(buf, &args);
15471     va_end(args);
15472     message = SvPV_const(msv,l1);
15473     if (l1 > 512)
15474         l1 = 512;
15475     Copy(message, buf, l1 , char);
15476     buf[l1-1] = '\0';                   /* Overwrite \n */
15477     Perl_croak(aTHX_ "%s", buf);
15478 }
15479
15480 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15481
15482 #ifndef PERL_IN_XSUB_RE
15483 void
15484 Perl_save_re_context(pTHX)
15485 {
15486     dVAR;
15487
15488     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15489     if (PL_curpm) {
15490         const REGEXP * const rx = PM_GETRE(PL_curpm);
15491         if (rx) {
15492             U32 i;
15493             for (i = 1; i <= RX_NPARENS(rx); i++) {
15494                 char digits[TYPE_CHARS(long)];
15495                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15496                 GV *const *const gvp
15497                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15498
15499                 if (gvp) {
15500                     GV * const gv = *gvp;
15501                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15502                         save_scalar(gv);
15503                 }
15504             }
15505         }
15506     }
15507 }
15508 #endif
15509
15510 #ifdef DEBUGGING
15511
15512 STATIC void
15513 S_put_byte(pTHX_ SV *sv, int c)
15514 {
15515     PERL_ARGS_ASSERT_PUT_BYTE;
15516
15517     /* Our definition of isPRINT() ignores locales, so only bytes that are
15518        not part of UTF-8 are considered printable. I assume that the same
15519        holds for UTF-EBCDIC.
15520        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15521        which Wikipedia says:
15522
15523        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15524        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15525        identical, to the ASCII delete (DEL) or rubout control character. ...
15526        it is typically mapped to hexadecimal code 9F, in order to provide a
15527        unique character mapping in both directions)
15528
15529        So the old condition can be simplified to !isPRINT(c)  */
15530     if (!isPRINT(c)) {
15531         switch (c) {
15532             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15533             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15534             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15535             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15536             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15537
15538             default:
15539                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15540                 break;
15541         }
15542     }
15543     else {
15544         const char string = c;
15545         if (c == '-' || c == ']' || c == '\\' || c == '^')
15546             sv_catpvs(sv, "\\");
15547         sv_catpvn(sv, &string, 1);
15548     }
15549 }
15550
15551 STATIC bool
15552 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15553 {
15554     /* Appends to 'sv' a displayable version of the innards of the bracketed
15555      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
15556      * output anything */
15557
15558     int i;
15559     int rangestart = -1;
15560     bool has_output_anything = FALSE;
15561
15562     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15563
15564     for (i = 0; i <= 256; i++) {
15565         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15566             if (rangestart == -1)
15567                 rangestart = i;
15568         } else if (rangestart != -1) {
15569             int j = i - 1;
15570             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
15571                 for (; rangestart < i; rangestart++)
15572                     put_byte(sv, rangestart);
15573             }
15574             else if (   j > 255
15575                      || ! isALPHANUMERIC(rangestart)
15576                      || ! isALPHANUMERIC(j)
15577                      || isDIGIT(rangestart) != isDIGIT(j)
15578                      || isUPPER(rangestart) != isUPPER(j)
15579                      || isLOWER(rangestart) != isLOWER(j)
15580
15581                         /* This final test should get optimized out except
15582                          * on EBCDIC platforms, where it causes ranges that
15583                          * cross discontinuities like i/j to be shown as hex
15584                          * instead of the misleading, e.g. H-K (since that
15585                          * range includes more than H, I, J, K). */
15586                      || (j - rangestart)
15587                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15588             {
15589                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15590                                rangestart,
15591                                (j < 256) ? j : 255);
15592             }
15593             else { /* Here, the ends of the range are both digits, or both
15594                       uppercase, or both lowercase; and there's no
15595                       discontinuity in the range (which could happen on EBCDIC
15596                       platforms) */
15597                 put_byte(sv, rangestart);
15598                 sv_catpvs(sv, "-");
15599                 put_byte(sv, j);
15600             }
15601             rangestart = -1;
15602             has_output_anything = TRUE;
15603         }
15604     }
15605
15606     return has_output_anything;
15607 }
15608
15609 #define CLEAR_OPTSTART \
15610     if (optstart) STMT_START { \
15611             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15612             optstart=NULL; \
15613     } STMT_END
15614
15615 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15616
15617 STATIC const regnode *
15618 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15619             const regnode *last, const regnode *plast,
15620             SV* sv, I32 indent, U32 depth)
15621 {
15622     dVAR;
15623     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15624     const regnode *next;
15625     const regnode *optstart= NULL;
15626
15627     RXi_GET_DECL(r,ri);
15628     GET_RE_DEBUG_FLAGS_DECL;
15629
15630     PERL_ARGS_ASSERT_DUMPUNTIL;
15631
15632 #ifdef DEBUG_DUMPUNTIL
15633     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15634         last ? last-start : 0,plast ? plast-start : 0);
15635 #endif
15636
15637     if (plast && plast < last)
15638         last= plast;
15639
15640     while (PL_regkind[op] != END && (!last || node < last)) {
15641         /* While that wasn't END last time... */
15642         NODE_ALIGN(node);
15643         op = OP(node);
15644         if (op == CLOSE || op == WHILEM)
15645             indent--;
15646         next = regnext((regnode *)node);
15647
15648         /* Where, what. */
15649         if (OP(node) == OPTIMIZED) {
15650             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15651                 optstart = node;
15652             else
15653                 goto after_print;
15654         } else
15655             CLEAR_OPTSTART;
15656
15657         regprop(r, sv, node);
15658         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15659                       (int)(2*indent + 1), "", SvPVX_const(sv));
15660
15661         if (OP(node) != OPTIMIZED) {
15662             if (next == NULL)           /* Next ptr. */
15663                 PerlIO_printf(Perl_debug_log, " (0)");
15664             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15665                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15666             else
15667                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15668             (void)PerlIO_putc(Perl_debug_log, '\n');
15669         }
15670
15671       after_print:
15672         if (PL_regkind[(U8)op] == BRANCHJ) {
15673             assert(next);
15674             {
15675                 const regnode *nnode = (OP(next) == LONGJMP
15676                                        ? regnext((regnode *)next)
15677                                        : next);
15678                 if (last && nnode > last)
15679                     nnode = last;
15680                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15681             }
15682         }
15683         else if (PL_regkind[(U8)op] == BRANCH) {
15684             assert(next);
15685             DUMPUNTIL(NEXTOPER(node), next);
15686         }
15687         else if ( PL_regkind[(U8)op]  == TRIE ) {
15688             const regnode *this_trie = node;
15689             const char op = OP(node);
15690             const U32 n = ARG(node);
15691             const reg_ac_data * const ac = op>=AHOCORASICK ?
15692                (reg_ac_data *)ri->data->data[n] :
15693                NULL;
15694             const reg_trie_data * const trie =
15695                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15696 #ifdef DEBUGGING
15697             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15698 #endif
15699             const regnode *nextbranch= NULL;
15700             I32 word_idx;
15701             sv_setpvs(sv, "");
15702             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15703                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15704
15705                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15706                    (int)(2*(indent+3)), "",
15707                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15708                             PL_colors[0], PL_colors[1],
15709                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15710                             PERL_PV_PRETTY_ELLIPSES    |
15711                             PERL_PV_PRETTY_LTGT
15712                             )
15713                             : "???"
15714                 );
15715                 if (trie->jump) {
15716                     U16 dist= trie->jump[word_idx+1];
15717                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15718                                   (UV)((dist ? this_trie + dist : next) - start));
15719                     if (dist) {
15720                         if (!nextbranch)
15721                             nextbranch= this_trie + trie->jump[0];
15722                         DUMPUNTIL(this_trie + dist, nextbranch);
15723                     }
15724                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15725                         nextbranch= regnext((regnode *)nextbranch);
15726                 } else {
15727                     PerlIO_printf(Perl_debug_log, "\n");
15728                 }
15729             }
15730             if (last && next > last)
15731                 node= last;
15732             else
15733                 node= next;
15734         }
15735         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15736             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15737                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15738         }
15739         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15740             assert(next);
15741             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15742         }
15743         else if ( op == PLUS || op == STAR) {
15744             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15745         }
15746         else if (PL_regkind[(U8)op] == ANYOF) {
15747             /* arglen 1 + class block */
15748             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15749                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15750             node = NEXTOPER(node);
15751         }
15752         else if (PL_regkind[(U8)op] == EXACT) {
15753             /* Literal string, where present. */
15754             node += NODE_SZ_STR(node) - 1;
15755             node = NEXTOPER(node);
15756         }
15757         else {
15758             node = NEXTOPER(node);
15759             node += regarglen[(U8)op];
15760         }
15761         if (op == CURLYX || op == OPEN)
15762             indent++;
15763     }
15764     CLEAR_OPTSTART;
15765 #ifdef DEBUG_DUMPUNTIL
15766     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15767 #endif
15768     return node;
15769 }
15770
15771 #endif  /* DEBUGGING */
15772
15773 /*
15774  * Local variables:
15775  * c-indentation-style: bsd
15776  * c-basic-offset: 4
15777  * indent-tabs-mode: nil
15778  * End:
15779  *
15780  * ex: set ts=8 sts=4 sw=4 et:
15781  */