src/5017011/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 235 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 236 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 237
 238 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 239
 240 /* whether trie related optimizations are enabled */
 241 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 242 #define TRIE_STUDY_OPT
 243 #define FULL_TRIE_STUDY
 244 #define TRIE_STCLASS
 245 #endif
 246
 247
 248
 249 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 250 #define PBITVAL(paren) (1 << ((paren) & 7))
 251 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 252 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 253 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 254
 255 #define REQUIRE_UTF8    STMT_START {                                       \
 256                                      if (!UTF) {                           \
 257                                          *flagp = RESTART_UTF8;            \
 258                                          return NULL;                      \
 259                                      }                                     \
 260                         } STMT_END
 261
 262 /* This converts the named class defined in regcomp.h to its equivalent class
 263  * number defined in handy.h. */
 264 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 265 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277     /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285      /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292     This is the position the string must appear at, or not before.
 293     It also implicitly (when combined with minlenp) tells us how many
 294     characters must match before the string we are searching for.
 295     Likewise when combined with minlenp and the length of the string it
 296     tells us how many characters must appear after the string we have
 297     found.
 298
 299   - max_offset
 300     Only used for floating strings. This is the rightmost point that
 301     the string can appear at. If set to I32 max it indicates that the
 302     string can occur infinitely far to the right.
 303
 304   - minlenp
 305     A pointer to the minimum number of characters of the pattern that the
 306     string was found inside. This is important as in the case of positive
 307     lookahead or positive lookbehind we can have multiple patterns
 308     involved. Consider
 309
 310     /(?=FOO).*F/
 311
 312     The minimum length of the pattern overall is 3, the minimum length
 313     of the lookahead part is 3, but the minimum length of the part that
 314     will actually match is 1. So 'FOO's minimum length is 3, but the
 315     minimum length for the F is 1. This is important as the minimum length
 316     is used to determine offsets in front of and behind the string being
 317     looked for.  Since strings can be composites this is the length of the
 318     pattern at the time it was committed with a scan_commit. Note that
 319     the length is calculated by study_chunk, so that the minimum lengths
 320     are not known until the full pattern has been compiled, thus the
 321     pointer to the value.
 322
 323   - lookbehind
 324
 325     In the case of lookbehind the string being searched for can be
 326     offset past the start point of the final matching string.
 327     If this value was just blithely removed from the min_offset it would
 328     invalidate some of the calculations for how many chars must match
 329     before or after (as they are derived from min_offset and minlen and
 330     the length of the string being searched for).
 331     When the final pattern is compiled and the data is moved from the
 332     scan_data_t structure into the regexp structure the information
 333     about lookbehind is factored in, with the information that would
 334     have been lost precalculated in the end_shift field for the
 335     associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343     /*I32 len_min;      unused */
 344     /*I32 len_delta;    unused */
 345     I32 pos_min;
 346     I32 pos_delta;
 347     SV *last_found;
 348     I32 last_end;           /* min value, <0 unless valid. */
 349     I32 last_start_min;
 350     I32 last_start_max;
 351     SV **longest;           /* Either &l_fixed, or &l_float. */
 352     SV *longest_fixed;      /* longest fixed string found in pattern */
 353     I32 offset_fixed;       /* offset where it starts */
 354     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 355     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 356     SV *longest_float;      /* longest floating string found in pattern */
 357     I32 offset_float_min;   /* earliest point in string it can appear */
 358     I32 offset_float_max;   /* latest point in string it can appear */
 359     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 360     I32 lookbehind_float;   /* is the position of the string modified by LB */
 361     I32 flags;
 362     I32 whilem_c;
 363     I32 *last_closep;
 364     struct regnode_charclass_class *start_class;
 365 } scan_data_t;
 366
 367 /*
 368  * Forward declarations for pregcomp()'s friends.
 369  */
 370
 371 static const scan_data_t zero_scan_data =
 372   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 373
 374 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 375 #define SF_BEFORE_SEOL          0x0001
 376 #define SF_BEFORE_MEOL          0x0002
 377 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 378 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 379
 380 #ifdef NO_UNARY_PLUS
 381 #  define SF_FIX_SHIFT_EOL      (0+2)
 382 #  define SF_FL_SHIFT_EOL               (0+4)
 383 #else
 384 #  define SF_FIX_SHIFT_EOL      (+2)
 385 #  define SF_FL_SHIFT_EOL               (+4)
 386 #endif
 387
 388 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 389 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 390
 391 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 392 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 393 #define SF_IS_INF               0x0040
 394 #define SF_HAS_PAR              0x0080
 395 #define SF_IN_PAR               0x0100
 396 #define SF_HAS_EVAL             0x0200
 397 #define SCF_DO_SUBSTR           0x0400
 398 #define SCF_DO_STCLASS_AND      0x0800
 399 #define SCF_DO_STCLASS_OR       0x1000
 400 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 401 #define SCF_WHILEM_VISITED_POS  0x2000
 402
 403 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 404 #define SCF_SEEN_ACCEPT         0x8000
 405
 406 #define UTF cBOOL(RExC_utf8)
 407
 408 /* The enums for all these are ordered so things work out correctly */
 409 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 410 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 411 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 412 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 413 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 414 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 415 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 416
 417 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 418
 419 #define OOB_NAMEDCLASS          -1
 420
 421 /* There is no code point that is out-of-bounds, so this is problematic.  But
 422  * its only current use is to initialize a variable that is always set before
 423  * looked at. */
 424 #define OOB_UNICODE             0xDEADBEEF
 425
 426 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 427 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 428
 429
 430 /* length of regex to show in messages that don't mark a position within */
 431 #define RegexLengthToShowInErrorMessages 127
 432
 433 /*
 434  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 435  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 436  * op/pragma/warn/regcomp.
 437  */
 438 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 439 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 440
 441 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 442
 443 /*
 444  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 445  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 446  * "...".
 447  */
 448 #define _FAIL(code) STMT_START {                                        \
 449     const char *ellipses = "";                                          \
 450     IV len = RExC_end - RExC_precomp;                                   \
 451                                                                         \
 452     if (!SIZE_ONLY)                                                     \
 453         SAVEFREESV(RExC_rx_sv);                                         \
 454     if (len > RegexLengthToShowInErrorMessages) {                       \
 455         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 456         len = RegexLengthToShowInErrorMessages - 10;                    \
 457         ellipses = "...";                                               \
 458     }                                                                   \
 459     code;                                                               \
 460 } STMT_END
 461
 462 #define FAIL(msg) _FAIL(                            \
 463     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 464             msg, (int)len, RExC_precomp, ellipses))
 465
 466 #define FAIL2(msg,arg) _FAIL(                       \
 467     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 468             arg, (int)len, RExC_precomp, ellipses))
 469
 470 /*
 471  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 472  */
 473 #define Simple_vFAIL(m) STMT_START {                                    \
 474     const IV offset = RExC_parse - RExC_precomp;                        \
 475     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 476             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 477 } STMT_END
 478
 479 /*
 480  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 481  */
 482 #define vFAIL(m) STMT_START {                           \
 483     if (!SIZE_ONLY)                                     \
 484         SAVEFREESV(RExC_rx_sv);                         \
 485     Simple_vFAIL(m);                                    \
 486 } STMT_END
 487
 488 /*
 489  * Like Simple_vFAIL(), but accepts two arguments.
 490  */
 491 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 492     const IV offset = RExC_parse - RExC_precomp;                        \
 493     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 494             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 495 } STMT_END
 496
 497 /*
 498  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 499  */
 500 #define vFAIL2(m,a1) STMT_START {                       \
 501     if (!SIZE_ONLY)                                     \
 502         SAVEFREESV(RExC_rx_sv);                         \
 503     Simple_vFAIL2(m, a1);                               \
 504 } STMT_END
 505
 506
 507 /*
 508  * Like Simple_vFAIL(), but accepts three arguments.
 509  */
 510 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 511     const IV offset = RExC_parse - RExC_precomp;                \
 512     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 513             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 514 } STMT_END
 515
 516 /*
 517  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 518  */
 519 #define vFAIL3(m,a1,a2) STMT_START {                    \
 520     if (!SIZE_ONLY)                                     \
 521         SAVEFREESV(RExC_rx_sv);                         \
 522     Simple_vFAIL3(m, a1, a2);                           \
 523 } STMT_END
 524
 525 /*
 526  * Like Simple_vFAIL(), but accepts four arguments.
 527  */
 528 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 529     const IV offset = RExC_parse - RExC_precomp;                \
 530     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 531             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 532 } STMT_END
 533
 534 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 535     if (!SIZE_ONLY)                                     \
 536         SAVEFREESV(RExC_rx_sv);                         \
 537     Simple_vFAIL4(m, a1, a2, a3);                       \
 538 } STMT_END
 539
 540 /* m is not necessarily a "literal string", in this macro */
 541 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 542     const IV offset = loc - RExC_precomp;                               \
 543     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 544             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 545 } STMT_END
 546
 547 #define ckWARNreg(loc,m) STMT_START {                                   \
 548     const IV offset = loc - RExC_precomp;                               \
 549     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 550             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 551 } STMT_END
 552
 553 #define vWARN_dep(loc, m) STMT_START {                                  \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 556             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 557 } STMT_END
 558
 559 #define ckWARNdep(loc,m) STMT_START {                                   \
 560     const IV offset = loc - RExC_precomp;                               \
 561     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 562             m REPORT_LOCATION,                                          \
 563             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 564 } STMT_END
 565
 566 #define ckWARNregdep(loc,m) STMT_START {                                \
 567     const IV offset = loc - RExC_precomp;                               \
 568     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 569             m REPORT_LOCATION,                                          \
 570             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 571 } STMT_END
 572
 573 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 574     const IV offset = loc - RExC_precomp;                               \
 575     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 576             m REPORT_LOCATION,                                          \
 577             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 578 } STMT_END
 579
 580 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 581     const IV offset = loc - RExC_precomp;                               \
 582     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 583             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 584 } STMT_END
 585
 586 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 587     const IV offset = loc - RExC_precomp;                               \
 588     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 589             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 590 } STMT_END
 591
 592 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 593     const IV offset = loc - RExC_precomp;                               \
 594     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 595             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 596 } STMT_END
 597
 598 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 599     const IV offset = loc - RExC_precomp;                               \
 600     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 601             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 602 } STMT_END
 603
 604 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 605     const IV offset = loc - RExC_precomp;                               \
 606     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 607             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 608 } STMT_END
 609
 610 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 611     const IV offset = loc - RExC_precomp;                               \
 612     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 613             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 614 } STMT_END
 615
 616
 617 /* Allow for side effects in s */
 618 #define REGC(c,s) STMT_START {                  \
 619     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 620 } STMT_END
 621
 622 /* Macros for recording node offsets.   20001227 mjd@plover.com
 623  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 624  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 625  * Element 0 holds the number n.
 626  * Position is 1 indexed.
 627  */
 628 #ifndef RE_TRACK_PATTERN_OFFSETS
 629 #define Set_Node_Offset_To_R(node,byte)
 630 #define Set_Node_Offset(node,byte)
 631 #define Set_Cur_Node_Offset
 632 #define Set_Node_Length_To_R(node,len)
 633 #define Set_Node_Length(node,len)
 634 #define Set_Node_Cur_Length(node)
 635 #define Node_Offset(n)
 636 #define Node_Length(n)
 637 #define Set_Node_Offset_Length(node,offset,len)
 638 #define ProgLen(ri) ri->u.proglen
 639 #define SetProgLen(ri,x) ri->u.proglen = x
 640 #else
 641 #define ProgLen(ri) ri->u.offsets[0]
 642 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 643 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 644     if (! SIZE_ONLY) {                                                  \
 645         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 646                     __LINE__, (int)(node), (int)(byte)));               \
 647         if((node) < 0) {                                                \
 648             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 649         } else {                                                        \
 650             RExC_offsets[2*(node)-1] = (byte);                          \
 651         }                                                               \
 652     }                                                                   \
 653 } STMT_END
 654
 655 #define Set_Node_Offset(node,byte) \
 656     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 657 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 658
 659 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 660     if (! SIZE_ONLY) {                                                  \
 661         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 662                 __LINE__, (int)(node), (int)(len)));                    \
 663         if((node) < 0) {                                                \
 664             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 665         } else {                                                        \
 666             RExC_offsets[2*(node)] = (len);                             \
 667         }                                                               \
 668     }                                                                   \
 669 } STMT_END
 670
 671 #define Set_Node_Length(node,len) \
 672     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 673 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 674 #define Set_Node_Cur_Length(node) \
 675     Set_Node_Length(node, RExC_parse - parse_start)
 676
 677 /* Get offsets and lengths */
 678 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 679 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 680
 681 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 682     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 683     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 684 } STMT_END
 685 #endif
 686
 687 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 688 #define EXPERIMENTAL_INPLACESCAN
 689 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 690
 691 #define DEBUG_STUDYDATA(str,data,depth)                              \
 692 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 693     PerlIO_printf(Perl_debug_log,                                    \
 694         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 695         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 696         (int)(depth)*2, "",                                          \
 697         (IV)((data)->pos_min),                                       \
 698         (IV)((data)->pos_delta),                                     \
 699         (UV)((data)->flags),                                         \
 700         (IV)((data)->whilem_c),                                      \
 701         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 702         is_inf ? "INF " : ""                                         \
 703     );                                                               \
 704     if ((data)->last_found)                                          \
 705         PerlIO_printf(Perl_debug_log,                                \
 706             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 707             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 708             SvPVX_const((data)->last_found),                         \
 709             (IV)((data)->last_end),                                  \
 710             (IV)((data)->last_start_min),                            \
 711             (IV)((data)->last_start_max),                            \
 712             ((data)->longest &&                                      \
 713              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 714             SvPVX_const((data)->longest_fixed),                      \
 715             (IV)((data)->offset_fixed),                              \
 716             ((data)->longest &&                                      \
 717              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 718             SvPVX_const((data)->longest_float),                      \
 719             (IV)((data)->offset_float_min),                          \
 720             (IV)((data)->offset_float_max)                           \
 721         );                                                           \
 722     PerlIO_printf(Perl_debug_log,"\n");                              \
 723 });
 724
 725 /* Mark that we cannot extend a found fixed substring at this point.
 726    Update the longest found anchored substring and the longest found
 727    floating substrings if needed. */
 728
 729 STATIC void
 730 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 731 {
 732     const STRLEN l = CHR_SVLEN(data->last_found);
 733     const STRLEN old_l = CHR_SVLEN(*data->longest);
 734     GET_RE_DEBUG_FLAGS_DECL;
 735
 736     PERL_ARGS_ASSERT_SCAN_COMMIT;
 737
 738     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 739         SvSetMagicSV(*data->longest, data->last_found);
 740         if (*data->longest == data->longest_fixed) {
 741             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 742             if (data->flags & SF_BEFORE_EOL)
 743                 data->flags
 744                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 745             else
 746                 data->flags &= ~SF_FIX_BEFORE_EOL;
 747             data->minlen_fixed=minlenp;
 748             data->lookbehind_fixed=0;
 749         }
 750         else { /* *data->longest == data->longest_float */
 751             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 752             data->offset_float_max = (l
 753                                       ? data->last_start_max
 754                                       : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 755             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 756                 data->offset_float_max = I32_MAX;
 757             if (data->flags & SF_BEFORE_EOL)
 758                 data->flags
 759                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 760             else
 761                 data->flags &= ~SF_FL_BEFORE_EOL;
 762             data->minlen_float=minlenp;
 763             data->lookbehind_float=0;
 764         }
 765     }
 766     SvCUR_set(data->last_found, 0);
 767     {
 768         SV * const sv = data->last_found;
 769         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 770             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 771             if (mg)
 772                 mg->mg_len = 0;
 773         }
 774     }
 775     data->last_end = -1;
 776     data->flags &= ~SF_BEFORE_EOL;
 777     DEBUG_STUDYDATA("commit: ",data,0);
 778 }
 779
 780 /* These macros set, clear and test whether the synthetic start class ('ssc',
 781  * given by the parameter) matches an empty string (EOS).  This uses the
 782  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 783  * stands alone, so there is never a next_off, so this field is otherwise
 784  * unused.  The EOS information is used only for compilation, but theoretically
 785  * it could be passed on to the execution code.  This could be used to store
 786  * more than one bit of information, but only this one is currently used. */
 787 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 788 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 789 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 790
 791 /* Can match anything (initialization) */
 792 STATIC void
 793 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 794 {
 795     PERL_ARGS_ASSERT_CL_ANYTHING;
 796
 797     ANYOF_BITMAP_SETALL(cl);
 798     cl->flags = ANYOF_UNICODE_ALL;
 799     SET_SSC_EOS(cl);
 800
 801     /* If any portion of the regex is to operate under locale rules,
 802      * initialization includes it.  The reason this isn't done for all regexes
 803      * is that the optimizer was written under the assumption that locale was
 804      * all-or-nothing.  Given the complexity and lack of documentation in the
 805      * optimizer, and that there are inadequate test cases for locale, so many
 806      * parts of it may not work properly, it is safest to avoid locale unless
 807      * necessary. */
 808     if (RExC_contains_locale) {
 809         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 810         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 811     }
 812     else {
 813         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 814     }
 815 }
 816
 817 /* Can match anything (initialization) */
 818 STATIC int
 819 S_cl_is_anything(const struct regnode_charclass_class *cl)
 820 {
 821     int value;
 822
 823     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 824
 825     for (value = 0; value < ANYOF_MAX; value += 2)
 826         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 827             return 1;
 828     if (!(cl->flags & ANYOF_UNICODE_ALL))
 829         return 0;
 830     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 831         return 0;
 832     return 1;
 833 }
 834
 835 /* Can match anything (initialization) */
 836 STATIC void
 837 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 838 {
 839     PERL_ARGS_ASSERT_CL_INIT;
 840
 841     Zero(cl, 1, struct regnode_charclass_class);
 842     cl->type = ANYOF;
 843     cl_anything(pRExC_state, cl);
 844     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 845 }
 846
 847 /* These two functions currently do the exact same thing */
 848 #define cl_init_zero            S_cl_init
 849
 850 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 851  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 852  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 853 STATIC void
 854 S_cl_and(struct regnode_charclass_class *cl,
 855         const struct regnode_charclass_class *and_with)
 856 {
 857     PERL_ARGS_ASSERT_CL_AND;
 858
 859     assert(PL_regkind[and_with->type] == ANYOF);
 860
 861     /* I (khw) am not sure all these restrictions are necessary XXX */
 862     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 863         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 864         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 865         && !(and_with->flags & ANYOF_LOC_FOLD)
 866         && !(cl->flags & ANYOF_LOC_FOLD)) {
 867         int i;
 868
 869         if (and_with->flags & ANYOF_INVERT)
 870             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 871                 cl->bitmap[i] &= ~and_with->bitmap[i];
 872         else
 873             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 874                 cl->bitmap[i] &= and_with->bitmap[i];
 875     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 876
 877     if (and_with->flags & ANYOF_INVERT) {
 878
 879         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 880          * aren't affected by the inversion.  Those that are affected are
 881          * handled individually below */
 882         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 883         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 884         cl->flags |= affected_flags;
 885
 886         /* We currently don't know how to deal with things that aren't in the
 887          * bitmap, but we know that the intersection is no greater than what
 888          * is already in cl, so let there be false positives that get sorted
 889          * out after the synthetic start class succeeds, and the node is
 890          * matched for real. */
 891
 892         /* The inversion of these two flags indicate that the resulting
 893          * intersection doesn't have them */
 894         if (and_with->flags & ANYOF_UNICODE_ALL) {
 895             cl->flags &= ~ANYOF_UNICODE_ALL;
 896         }
 897         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 898             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 899         }
 900     }
 901     else {   /* and'd node is not inverted */
 902         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 903
 904         if (! ANYOF_NONBITMAP(and_with)) {
 905
 906             /* Here 'and_with' doesn't match anything outside the bitmap
 907              * (except possibly ANYOF_UNICODE_ALL), which means the
 908              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 909              * which case we don't know what the intersection is, but it's no
 910              * greater than what cl already has, so can just leave it alone,
 911              * with possible false positives */
 912             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 913                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 914                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 915             }
 916         }
 917         else if (! ANYOF_NONBITMAP(cl)) {
 918
 919             /* Here, 'and_with' does match something outside the bitmap, and cl
 920              * doesn't have a list of things to match outside the bitmap.  If
 921              * cl can match all code points above 255, the intersection will
 922              * be those above-255 code points that 'and_with' matches.  If cl
 923              * can't match all Unicode code points, it means that it can't
 924              * match anything outside the bitmap (since the 'if' that got us
 925              * into this block tested for that), so we leave the bitmap empty.
 926              */
 927             if (cl->flags & ANYOF_UNICODE_ALL) {
 928                 ARG_SET(cl, ARG(and_with));
 929
 930                 /* and_with's ARG may match things that don't require UTF8.
 931                  * And now cl's will too, in spite of this being an 'and'.  See
 932                  * the comments below about the kludge */
 933                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 934             }
 935         }
 936         else {
 937             /* Here, both 'and_with' and cl match something outside the
 938              * bitmap.  Currently we do not do the intersection, so just match
 939              * whatever cl had at the beginning.  */
 940         }
 941
 942
 943         /* Take the intersection of the two sets of flags.  However, the
 944          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 945          * kludge around the fact that this flag is not treated like the others
 946          * which are initialized in cl_anything().  The way the optimizer works
 947          * is that the synthetic start class (SSC) is initialized to match
 948          * anything, and then the first time a real node is encountered, its
 949          * values are AND'd with the SSC's with the result being the values of
 950          * the real node.  However, there are paths through the optimizer where
 951          * the AND never gets called, so those initialized bits are set
 952          * inappropriately, which is not usually a big deal, as they just cause
 953          * false positives in the SSC, which will just mean a probably
 954          * imperceptible slow down in execution.  However this bit has a
 955          * higher false positive consequence in that it can cause utf8.pm,
 956          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 957          * bigger slowdown and also causes significant extra memory to be used.
 958          * In order to prevent this, the code now takes a different tack.  The
 959          * bit isn't set unless some part of the regular expression needs it,
 960          * but once set it won't get cleared.  This means that these extra
 961          * modules won't get loaded unless there was some path through the
 962          * pattern that would have required them anyway, and  so any false
 963          * positives that occur by not ANDing them out when they could be
 964          * aren't as severe as they would be if we treated this bit like all
 965          * the others */
 966         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 967                                       & ANYOF_NONBITMAP_NON_UTF8;
 968         cl->flags &= and_with->flags;
 969         cl->flags |= outside_bitmap_but_not_utf8;
 970     }
 971 }
 972
 973 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 974  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 975  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 976 STATIC void
 977 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 978 {
 979     PERL_ARGS_ASSERT_CL_OR;
 980
 981     if (or_with->flags & ANYOF_INVERT) {
 982
 983         /* Here, the or'd node is to be inverted.  This means we take the
 984          * complement of everything not in the bitmap, but currently we don't
 985          * know what that is, so give up and match anything */
 986         if (ANYOF_NONBITMAP(or_with)) {
 987             cl_anything(pRExC_state, cl);
 988         }
 989         /* We do not use
 990          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 991          *   <= (B1 | !B2) | (CL1 | !CL2)
 992          * which is wasteful if CL2 is small, but we ignore CL2:
 993          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 994          * XXXX Can we handle case-fold?  Unclear:
 995          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 996          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 997          */
 998         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 999              && !(or_with->flags & ANYOF_LOC_FOLD)
1000              && !(cl->flags & ANYOF_LOC_FOLD) ) {
1001             int i;
1002
1003             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1004                 cl->bitmap[i] |= ~or_with->bitmap[i];
1005         } /* XXXX: logic is complicated otherwise */
1006         else {
1007             cl_anything(pRExC_state, cl);
1008         }
1009
1010         /* And, we can just take the union of the flags that aren't affected
1011          * by the inversion */
1012         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1013
1014         /* For the remaining flags:
1015             ANYOF_UNICODE_ALL and inverted means to not match anything above
1016                     255, which means that the union with cl should just be
1017                     what cl has in it, so can ignore this flag
1018             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1019                     is 127-255 to match them, but then invert that, so the
1020                     union with cl should just be what cl has in it, so can
1021                     ignore this flag
1022          */
1023     } else {    /* 'or_with' is not inverted */
1024         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1025         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1026              && (!(or_with->flags & ANYOF_LOC_FOLD)
1027                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1028             int i;
1029
1030             /* OR char bitmap and class bitmap separately */
1031             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1032                 cl->bitmap[i] |= or_with->bitmap[i];
1033             if (or_with->flags & ANYOF_CLASS) {
1034                 ANYOF_CLASS_OR(or_with, cl);
1035             }
1036         }
1037         else { /* XXXX: logic is complicated, leave it along for a moment. */
1038             cl_anything(pRExC_state, cl);
1039         }
1040
1041         if (ANYOF_NONBITMAP(or_with)) {
1042
1043             /* Use the added node's outside-the-bit-map match if there isn't a
1044              * conflict.  If there is a conflict (both nodes match something
1045              * outside the bitmap, but what they match outside is not the same
1046              * pointer, and hence not easily compared until XXX we extend
1047              * inversion lists this far), give up and allow the start class to
1048              * match everything outside the bitmap.  If that stuff is all above
1049              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1050             if (! ANYOF_NONBITMAP(cl)) {
1051                 ARG_SET(cl, ARG(or_with));
1052             }
1053             else if (ARG(cl) != ARG(or_with)) {
1054
1055                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1056                     cl_anything(pRExC_state, cl);
1057                 }
1058                 else {
1059                     cl->flags |= ANYOF_UNICODE_ALL;
1060                 }
1061             }
1062         }
1063
1064         /* Take the union */
1065         cl->flags |= or_with->flags;
1066     }
1067 }
1068
1069 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1070 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1071 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1072 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1073
1074
1075 #ifdef DEBUGGING
1076 /*
1077    dump_trie(trie,widecharmap,revcharmap)
1078    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1079    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1080
1081    These routines dump out a trie in a somewhat readable format.
1082    The _interim_ variants are used for debugging the interim
1083    tables that are used to generate the final compressed
1084    representation which is what dump_trie expects.
1085
1086    Part of the reason for their existence is to provide a form
1087    of documentation as to how the different representations function.
1088
1089 */
1090
1091 /*
1092   Dumps the final compressed table form of the trie to Perl_debug_log.
1093   Used for debugging make_trie().
1094 */
1095
1096 STATIC void
1097 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1098             AV *revcharmap, U32 depth)
1099 {
1100     U32 state;
1101     SV *sv=sv_newmortal();
1102     int colwidth= widecharmap ? 6 : 4;
1103     U16 word;
1104     GET_RE_DEBUG_FLAGS_DECL;
1105
1106     PERL_ARGS_ASSERT_DUMP_TRIE;
1107
1108     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1109         (int)depth * 2 + 2,"",
1110         "Match","Base","Ofs" );
1111
1112     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1113         SV ** const tmp = av_fetch( revcharmap, state, 0);
1114         if ( tmp ) {
1115             PerlIO_printf( Perl_debug_log, "%*s",
1116                 colwidth,
1117                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1118                             PL_colors[0], PL_colors[1],
1119                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1120                             PERL_PV_ESCAPE_FIRSTCHAR
1121                 )
1122             );
1123         }
1124     }
1125     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1126         (int)depth * 2 + 2,"");
1127
1128     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1129         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1130     PerlIO_printf( Perl_debug_log, "\n");
1131
1132     for( state = 1 ; state < trie->statecount ; state++ ) {
1133         const U32 base = trie->states[ state ].trans.base;
1134
1135         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1136
1137         if ( trie->states[ state ].wordnum ) {
1138             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1139         } else {
1140             PerlIO_printf( Perl_debug_log, "%6s", "" );
1141         }
1142
1143         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1144
1145         if ( base ) {
1146             U32 ofs = 0;
1147
1148             while( ( base + ofs  < trie->uniquecharcount ) ||
1149                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1150                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1151                     ofs++;
1152
1153             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1154
1155             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1156                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1157                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1158                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1159                 {
1160                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1161                     colwidth,
1162                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1163                 } else {
1164                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1165                 }
1166             }
1167
1168             PerlIO_printf( Perl_debug_log, "]");
1169
1170         }
1171         PerlIO_printf( Perl_debug_log, "\n" );
1172     }
1173     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1174     for (word=1; word <= trie->wordcount; word++) {
1175         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1176             (int)word, (int)(trie->wordinfo[word].prev),
1177             (int)(trie->wordinfo[word].len));
1178     }
1179     PerlIO_printf(Perl_debug_log, "\n" );
1180 }
1181 /*
1182   Dumps a fully constructed but uncompressed trie in list form.
1183   List tries normally only are used for construction when the number of
1184   possible chars (trie->uniquecharcount) is very high.
1185   Used for debugging make_trie().
1186 */
1187 STATIC void
1188 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1189                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1190                          U32 depth)
1191 {
1192     U32 state;
1193     SV *sv=sv_newmortal();
1194     int colwidth= widecharmap ? 6 : 4;
1195     GET_RE_DEBUG_FLAGS_DECL;
1196
1197     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1198
1199     /* print out the table precompression.  */
1200     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1201         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1202         "------:-----+-----------------\n" );
1203
1204     for( state=1 ; state < next_alloc ; state ++ ) {
1205         U16 charid;
1206
1207         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1208             (int)depth * 2 + 2,"", (UV)state  );
1209         if ( ! trie->states[ state ].wordnum ) {
1210             PerlIO_printf( Perl_debug_log, "%5s| ","");
1211         } else {
1212             PerlIO_printf( Perl_debug_log, "W%4x| ",
1213                 trie->states[ state ].wordnum
1214             );
1215         }
1216         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1217             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1218             if ( tmp ) {
1219                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1220                     colwidth,
1221                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1222                             PL_colors[0], PL_colors[1],
1223                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1224                             PERL_PV_ESCAPE_FIRSTCHAR
1225                     ) ,
1226                     TRIE_LIST_ITEM(state,charid).forid,
1227                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1228                 );
1229                 if (!(charid % 10))
1230                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1231                         (int)((depth * 2) + 14), "");
1232             }
1233         }
1234         PerlIO_printf( Perl_debug_log, "\n");
1235     }
1236 }
1237
1238 /*
1239   Dumps a fully constructed but uncompressed trie in table form.
1240   This is the normal DFA style state transition table, with a few
1241   twists to facilitate compression later.
1242   Used for debugging make_trie().
1243 */
1244 STATIC void
1245 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1246                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1247                           U32 depth)
1248 {
1249     U32 state;
1250     U16 charid;
1251     SV *sv=sv_newmortal();
1252     int colwidth= widecharmap ? 6 : 4;
1253     GET_RE_DEBUG_FLAGS_DECL;
1254
1255     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1256
1257     /*
1258        print out the table precompression so that we can do a visual check
1259        that they are identical.
1260      */
1261
1262     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1263
1264     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1265         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1266         if ( tmp ) {
1267             PerlIO_printf( Perl_debug_log, "%*s",
1268                 colwidth,
1269                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1270                             PL_colors[0], PL_colors[1],
1271                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1272                             PERL_PV_ESCAPE_FIRSTCHAR
1273                 )
1274             );
1275         }
1276     }
1277
1278     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1279
1280     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1281         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1282     }
1283
1284     PerlIO_printf( Perl_debug_log, "\n" );
1285
1286     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1287
1288         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1289             (int)depth * 2 + 2,"",
1290             (UV)TRIE_NODENUM( state ) );
1291
1292         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1293             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1294             if (v)
1295                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1296             else
1297                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1298         }
1299         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1300             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1301         } else {
1302             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1303             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1304         }
1305     }
1306 }
1307
1308 #endif
1309
1310
1311 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1312   startbranch: the first branch in the whole branch sequence
1313   first      : start branch of sequence of branch-exact nodes.
1314                May be the same as startbranch
1315   last       : Thing following the last branch.
1316                May be the same as tail.
1317   tail       : item following the branch sequence
1318   count      : words in the sequence
1319   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1320   depth      : indent depth
1321
1322 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1323
1324 A trie is an N'ary tree where the branches are determined by digital
1325 decomposition of the key. IE, at the root node you look up the 1st character and
1326 follow that branch repeat until you find the end of the branches. Nodes can be
1327 marked as "accepting" meaning they represent a complete word. Eg:
1328
1329   /he|she|his|hers/
1330
1331 would convert into the following structure. Numbers represent states, letters
1332 following numbers represent valid transitions on the letter from that state, if
1333 the number is in square brackets it represents an accepting state, otherwise it
1334 will be in parenthesis.
1335
1336       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1337       |    |
1338       |   (2)
1339       |    |
1340      (1)   +-i->(6)-+-s->[7]
1341       |
1342       +-s->(3)-+-h->(4)-+-e->[5]
1343
1344       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1345
1346 This shows that when matching against the string 'hers' we will begin at state 1
1347 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1348 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1349 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1350 single traverse. We store a mapping from accepting to state to which word was
1351 matched, and then when we have multiple possibilities we try to complete the
1352 rest of the regex in the order in which they occured in the alternation.
1353
1354 The only prior NFA like behaviour that would be changed by the TRIE support is
1355 the silent ignoring of duplicate alternations which are of the form:
1356
1357  / (DUPE|DUPE) X? (?{ ... }) Y /x
1358
1359 Thus EVAL blocks following a trie may be called a different number of times with
1360 and without the optimisation. With the optimisations dupes will be silently
1361 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1362 the following demonstrates:
1363
1364  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1365
1366 which prints out 'word' three times, but
1367
1368  'words'=~/(word|word|word)(?{ print $1 })S/
1369
1370 which doesnt print it out at all. This is due to other optimisations kicking in.
1371
1372 Example of what happens on a structural level:
1373
1374 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1375
1376    1: CURLYM[1] {1,32767}(18)
1377    5:   BRANCH(8)
1378    6:     EXACT <ac>(16)
1379    8:   BRANCH(11)
1380    9:     EXACT <ad>(16)
1381   11:   BRANCH(14)
1382   12:     EXACT <ab>(16)
1383   16:   SUCCEED(0)
1384   17:   NOTHING(18)
1385   18: END(0)
1386
1387 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1388 and should turn into:
1389
1390    1: CURLYM[1] {1,32767}(18)
1391    5:   TRIE(16)
1392         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1393           <ac>
1394           <ad>
1395           <ab>
1396   16:   SUCCEED(0)
1397   17:   NOTHING(18)
1398   18: END(0)
1399
1400 Cases where tail != last would be like /(?foo|bar)baz/:
1401
1402    1: BRANCH(4)
1403    2:   EXACT <foo>(8)
1404    4: BRANCH(7)
1405    5:   EXACT <bar>(8)
1406    7: TAIL(8)
1407    8: EXACT <baz>(10)
1408   10: END(0)
1409
1410 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1411 and would end up looking like:
1412
1413     1: TRIE(8)
1414       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1415         <foo>
1416         <bar>
1417    7: TAIL(8)
1418    8: EXACT <baz>(10)
1419   10: END(0)
1420
1421     d = uvuni_to_utf8_flags(d, uv, 0);
1422
1423 is the recommended Unicode-aware way of saying
1424
1425     *(d++) = uv;
1426 */
1427
1428 #define TRIE_STORE_REVCHAR(val)                                            \
1429     STMT_START {                                                           \
1430         if (UTF) {                                                         \
1431             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1432             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1433             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1434             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1435             SvPOK_on(zlopp);                                               \
1436             SvUTF8_on(zlopp);                                              \
1437             av_push(revcharmap, zlopp);                                    \
1438         } else {                                                           \
1439             char ooooff = (char)val;                                           \
1440             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1441         }                                                                  \
1442         } STMT_END
1443
1444 #define TRIE_READ_CHAR STMT_START {                                                     \
1445     wordlen++;                                                                          \
1446     if ( UTF ) {                                                                        \
1447         /* if it is UTF then it is either already folded, or does not need folding */   \
1448         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1449     }                                                                                   \
1450     else if (folder == PL_fold_latin1) {                                                \
1451         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1452         if ( foldlen > 0 ) {                                                            \
1453            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1454            foldlen -= len;                                                              \
1455            scan += len;                                                                 \
1456            len = 0;                                                                     \
1457         } else {                                                                        \
1458             len = 1;                                                                    \
1459             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1460             skiplen = UNISKIP(uvc);                                                     \
1461             foldlen -= skiplen;                                                         \
1462             scan = foldbuf + skiplen;                                                   \
1463         }                                                                               \
1464     } else {                                                                            \
1465         /* raw data, will be folded later if needed */                                  \
1466         uvc = (U32)*uc;                                                                 \
1467         len = 1;                                                                        \
1468     }                                                                                   \
1469 } STMT_END
1470
1471
1472
1473 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1474     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1475         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1476         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1477     }                                                           \
1478     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1479     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1480     TRIE_LIST_CUR( state )++;                                   \
1481 } STMT_END
1482
1483 #define TRIE_LIST_NEW(state) STMT_START {                       \
1484     Newxz( trie->states[ state ].trans.list,               \
1485         4, reg_trie_trans_le );                                 \
1486      TRIE_LIST_CUR( state ) = 1;                                \
1487      TRIE_LIST_LEN( state ) = 4;                                \
1488 } STMT_END
1489
1490 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1491     U16 dupe= trie->states[ state ].wordnum;                    \
1492     regnode * const noper_next = regnext( noper );              \
1493                                                                 \
1494     DEBUG_r({                                                   \
1495         /* store the word for dumping */                        \
1496         SV* tmp;                                                \
1497         if (OP(noper) != NOTHING)                               \
1498             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1499         else                                                    \
1500             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1501         av_push( trie_words, tmp );                             \
1502     });                                                         \
1503                                                                 \
1504     curword++;                                                  \
1505     trie->wordinfo[curword].prev   = 0;                         \
1506     trie->wordinfo[curword].len    = wordlen;                   \
1507     trie->wordinfo[curword].accept = state;                     \
1508                                                                 \
1509     if ( noper_next < tail ) {                                  \
1510         if (!trie->jump)                                        \
1511             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1512         trie->jump[curword] = (U16)(noper_next - convert);      \
1513         if (!jumper)                                            \
1514             jumper = noper_next;                                \
1515         if (!nextbranch)                                        \
1516             nextbranch= regnext(cur);                           \
1517     }                                                           \
1518                                                                 \
1519     if ( dupe ) {                                               \
1520         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1521         /* chain, so that when the bits of chain are later    */\
1522         /* linked together, the dups appear in the chain      */\
1523         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1524         trie->wordinfo[dupe].prev = curword;                    \
1525     } else {                                                    \
1526         /* we haven't inserted this word yet.                */ \
1527         trie->states[ state ].wordnum = curword;                \
1528     }                                                           \
1529 } STMT_END
1530
1531
1532 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1533      ( ( base + charid >=  ucharcount                                   \
1534          && base + charid < ubound                                      \
1535          && state == trie->trans[ base - ucharcount + charid ].check    \
1536          && trie->trans[ base - ucharcount + charid ].next )            \
1537            ? trie->trans[ base - ucharcount + charid ].next             \
1538            : ( state==1 ? special : 0 )                                 \
1539       )
1540
1541 #define MADE_TRIE       1
1542 #define MADE_JUMP_TRIE  2
1543 #define MADE_EXACT_TRIE 4
1544
1545 STATIC I32
1546 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1547 {
1548     dVAR;
1549     /* first pass, loop through and scan words */
1550     reg_trie_data *trie;
1551     HV *widecharmap = NULL;
1552     AV *revcharmap = newAV();
1553     regnode *cur;
1554     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1555     STRLEN len = 0;
1556     UV uvc = 0;
1557     U16 curword = 0;
1558     U32 next_alloc = 0;
1559     regnode *jumper = NULL;
1560     regnode *nextbranch = NULL;
1561     regnode *convert = NULL;
1562     U32 *prev_states; /* temp array mapping each state to previous one */
1563     /* we just use folder as a flag in utf8 */
1564     const U8 * folder = NULL;
1565
1566 #ifdef DEBUGGING
1567     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1568     AV *trie_words = NULL;
1569     /* along with revcharmap, this only used during construction but both are
1570      * useful during debugging so we store them in the struct when debugging.
1571      */
1572 #else
1573     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1574     STRLEN trie_charcount=0;
1575 #endif
1576     SV *re_trie_maxbuff;
1577     GET_RE_DEBUG_FLAGS_DECL;
1578
1579     PERL_ARGS_ASSERT_MAKE_TRIE;
1580 #ifndef DEBUGGING
1581     PERL_UNUSED_ARG(depth);
1582 #endif
1583
1584     switch (flags) {
1585         case EXACT: break;
1586         case EXACTFA:
1587         case EXACTFU_SS:
1588         case EXACTFU_TRICKYFOLD:
1589         case EXACTFU: folder = PL_fold_latin1; break;
1590         case EXACTF:  folder = PL_fold; break;
1591         case EXACTFL: folder = PL_fold_locale; break;
1592         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1593     }
1594
1595     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1596     trie->refcount = 1;
1597     trie->startstate = 1;
1598     trie->wordcount = word_count;
1599     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1600     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1601     if (flags == EXACT)
1602         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1603     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1604                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1605
1606     DEBUG_r({
1607         trie_words = newAV();
1608     });
1609
1610     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1611     if (!SvIOK(re_trie_maxbuff)) {
1612         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1613     }
1614     DEBUG_TRIE_COMPILE_r({
1615                 PerlIO_printf( Perl_debug_log,
1616                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1617                   (int)depth * 2 + 2, "",
1618                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1619                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1620                   (int)depth);
1621     });
1622
1623    /* Find the node we are going to overwrite */
1624     if ( first == startbranch && OP( last ) != BRANCH ) {
1625         /* whole branch chain */
1626         convert = first;
1627     } else {
1628         /* branch sub-chain */
1629         convert = NEXTOPER( first );
1630     }
1631
1632     /*  -- First loop and Setup --
1633
1634        We first traverse the branches and scan each word to determine if it
1635        contains widechars, and how many unique chars there are, this is
1636        important as we have to build a table with at least as many columns as we
1637        have unique chars.
1638
1639        We use an array of integers to represent the character codes 0..255
1640        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1641        native representation of the character value as the key and IV's for the
1642        coded index.
1643
1644        *TODO* If we keep track of how many times each character is used we can
1645        remap the columns so that the table compression later on is more
1646        efficient in terms of memory by ensuring the most common value is in the
1647        middle and the least common are on the outside.  IMO this would be better
1648        than a most to least common mapping as theres a decent chance the most
1649        common letter will share a node with the least common, meaning the node
1650        will not be compressible. With a middle is most common approach the worst
1651        case is when we have the least common nodes twice.
1652
1653      */
1654
1655     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1656         regnode *noper = NEXTOPER( cur );
1657         const U8 *uc = (U8*)STRING( noper );
1658         const U8 *e  = uc + STR_LEN( noper );
1659         STRLEN foldlen = 0;
1660         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1661         STRLEN skiplen = 0;
1662         const U8 *scan = (U8*)NULL;
1663         U32 wordlen      = 0;         /* required init */
1664         STRLEN chars = 0;
1665         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1666
1667         if (OP(noper) == NOTHING) {
1668             regnode *noper_next= regnext(noper);
1669             if (noper_next != tail && OP(noper_next) == flags) {
1670                 noper = noper_next;
1671                 uc= (U8*)STRING(noper);
1672                 e= uc + STR_LEN(noper);
1673                 trie->minlen= STR_LEN(noper);
1674             } else {
1675                 trie->minlen= 0;
1676                 continue;
1677             }
1678         }
1679
1680         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1681             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1682                                           regardless of encoding */
1683             if (OP( noper ) == EXACTFU_SS) {
1684                 /* false positives are ok, so just set this */
1685                 TRIE_BITMAP_SET(trie,0xDF);
1686             }
1687         }
1688         for ( ; uc < e ; uc += len ) {
1689             TRIE_CHARCOUNT(trie)++;
1690             TRIE_READ_CHAR;
1691             chars++;
1692             if ( uvc < 256 ) {
1693                 if ( folder ) {
1694                     U8 folded= folder[ (U8) uvc ];
1695                     if ( !trie->charmap[ folded ] ) {
1696                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1697                         TRIE_STORE_REVCHAR( folded );
1698                     }
1699                 }
1700                 if ( !trie->charmap[ uvc ] ) {
1701                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1702                     TRIE_STORE_REVCHAR( uvc );
1703                 }
1704                 if ( set_bit ) {
1705                     /* store the codepoint in the bitmap, and its folded
1706                      * equivalent. */
1707                     TRIE_BITMAP_SET(trie, uvc);
1708
1709                     /* store the folded codepoint */
1710                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1711
1712                     if ( !UTF ) {
1713                         /* store first byte of utf8 representation of
1714                            variant codepoints */
1715                         if (! UNI_IS_INVARIANT(uvc)) {
1716                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1717                         }
1718                     }
1719                     set_bit = 0; /* We've done our bit :-) */
1720                 }
1721             } else {
1722                 SV** svpp;
1723                 if ( !widecharmap )
1724                     widecharmap = newHV();
1725
1726                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1727
1728                 if ( !svpp )
1729                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1730
1731                 if ( !SvTRUE( *svpp ) ) {
1732                     sv_setiv( *svpp, ++trie->uniquecharcount );
1733                     TRIE_STORE_REVCHAR(uvc);
1734                 }
1735             }
1736         }
1737         if( cur == first ) {
1738             trie->minlen = chars;
1739             trie->maxlen = chars;
1740         } else if (chars < trie->minlen) {
1741             trie->minlen = chars;
1742         } else if (chars > trie->maxlen) {
1743             trie->maxlen = chars;
1744         }
1745         if (OP( noper ) == EXACTFU_SS) {
1746             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1747             if (trie->minlen > 1)
1748                 trie->minlen= 1;
1749         }
1750         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1751             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1752              *                - We assume that any such sequence might match a 2 byte string */
1753             if (trie->minlen > 2 )
1754                 trie->minlen= 2;
1755         }
1756
1757     } /* end first pass */
1758     DEBUG_TRIE_COMPILE_r(
1759         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1760                 (int)depth * 2 + 2,"",
1761                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1762                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1763                 (int)trie->minlen, (int)trie->maxlen )
1764     );
1765
1766     /*
1767         We now know what we are dealing with in terms of unique chars and
1768         string sizes so we can calculate how much memory a naive
1769         representation using a flat table  will take. If it's over a reasonable
1770         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1771         conservative but potentially much slower representation using an array
1772         of lists.
1773
1774         At the end we convert both representations into the same compressed
1775         form that will be used in regexec.c for matching with. The latter
1776         is a form that cannot be used to construct with but has memory
1777         properties similar to the list form and access properties similar
1778         to the table form making it both suitable for fast searches and
1779         small enough that its feasable to store for the duration of a program.
1780
1781         See the comment in the code where the compressed table is produced
1782         inplace from the flat tabe representation for an explanation of how
1783         the compression works.
1784
1785     */
1786
1787
1788     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1789     prev_states[1] = 0;
1790
1791     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1792         /*
1793             Second Pass -- Array Of Lists Representation
1794
1795             Each state will be represented by a list of charid:state records
1796             (reg_trie_trans_le) the first such element holds the CUR and LEN
1797             points of the allocated array. (See defines above).
1798
1799             We build the initial structure using the lists, and then convert
1800             it into the compressed table form which allows faster lookups
1801             (but cant be modified once converted).
1802         */
1803
1804         STRLEN transcount = 1;
1805
1806         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1807             "%*sCompiling trie using list compiler\n",
1808             (int)depth * 2 + 2, ""));
1809
1810         trie->states = (reg_trie_state *)
1811             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1812                                   sizeof(reg_trie_state) );
1813         TRIE_LIST_NEW(1);
1814         next_alloc = 2;
1815
1816         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1817
1818             regnode *noper   = NEXTOPER( cur );
1819             U8 *uc           = (U8*)STRING( noper );
1820             const U8 *e      = uc + STR_LEN( noper );
1821             U32 state        = 1;         /* required init */
1822             U16 charid       = 0;         /* sanity init */
1823             U8 *scan         = (U8*)NULL; /* sanity init */
1824             STRLEN foldlen   = 0;         /* required init */
1825             U32 wordlen      = 0;         /* required init */
1826             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1827             STRLEN skiplen   = 0;
1828
1829             if (OP(noper) == NOTHING) {
1830                 regnode *noper_next= regnext(noper);
1831                 if (noper_next != tail && OP(noper_next) == flags) {
1832                     noper = noper_next;
1833                     uc= (U8*)STRING(noper);
1834                     e= uc + STR_LEN(noper);
1835                 }
1836             }
1837
1838             if (OP(noper) != NOTHING) {
1839                 for ( ; uc < e ; uc += len ) {
1840
1841                     TRIE_READ_CHAR;
1842
1843                     if ( uvc < 256 ) {
1844                         charid = trie->charmap[ uvc ];
1845                     } else {
1846                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1847                         if ( !svpp ) {
1848                             charid = 0;
1849                         } else {
1850                             charid=(U16)SvIV( *svpp );
1851                         }
1852                     }
1853                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1854                     if ( charid ) {
1855
1856                         U16 check;
1857                         U32 newstate = 0;
1858
1859                         charid--;
1860                         if ( !trie->states[ state ].trans.list ) {
1861                             TRIE_LIST_NEW( state );
1862                         }
1863                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1864                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1865                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1866                                 break;
1867                             }
1868                         }
1869                         if ( ! newstate ) {
1870                             newstate = next_alloc++;
1871                             prev_states[newstate] = state;
1872                             TRIE_LIST_PUSH( state, charid, newstate );
1873                             transcount++;
1874                         }
1875                         state = newstate;
1876                     } else {
1877                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1878                     }
1879                 }
1880             }
1881             TRIE_HANDLE_WORD(state);
1882
1883         } /* end second pass */
1884
1885         /* next alloc is the NEXT state to be allocated */
1886         trie->statecount = next_alloc;
1887         trie->states = (reg_trie_state *)
1888             PerlMemShared_realloc( trie->states,
1889                                    next_alloc
1890                                    * sizeof(reg_trie_state) );
1891
1892         /* and now dump it out before we compress it */
1893         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1894                                                          revcharmap, next_alloc,
1895                                                          depth+1)
1896         );
1897
1898         trie->trans = (reg_trie_trans *)
1899             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1900         {
1901             U32 state;
1902             U32 tp = 0;
1903             U32 zp = 0;
1904
1905
1906             for( state=1 ; state < next_alloc ; state ++ ) {
1907                 U32 base=0;
1908
1909                 /*
1910                 DEBUG_TRIE_COMPILE_MORE_r(
1911                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1912                 );
1913                 */
1914
1915                 if (trie->states[state].trans.list) {
1916                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1917                     U16 maxid=minid;
1918                     U16 idx;
1919
1920                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1921                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1922                         if ( forid < minid ) {
1923                             minid=forid;
1924                         } else if ( forid > maxid ) {
1925                             maxid=forid;
1926                         }
1927                     }
1928                     if ( transcount < tp + maxid - minid + 1) {
1929                         transcount *= 2;
1930                         trie->trans = (reg_trie_trans *)
1931                             PerlMemShared_realloc( trie->trans,
1932                                                      transcount
1933                                                      * sizeof(reg_trie_trans) );
1934                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1935                     }
1936                     base = trie->uniquecharcount + tp - minid;
1937                     if ( maxid == minid ) {
1938                         U32 set = 0;
1939                         for ( ; zp < tp ; zp++ ) {
1940                             if ( ! trie->trans[ zp ].next ) {
1941                                 base = trie->uniquecharcount + zp - minid;
1942                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1943                                 trie->trans[ zp ].check = state;
1944                                 set = 1;
1945                                 break;
1946                             }
1947                         }
1948                         if ( !set ) {
1949                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1950                             trie->trans[ tp ].check = state;
1951                             tp++;
1952                             zp = tp;
1953                         }
1954                     } else {
1955                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1956                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1957                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1958                             trie->trans[ tid ].check = state;
1959                         }
1960                         tp += ( maxid - minid + 1 );
1961                     }
1962                     Safefree(trie->states[ state ].trans.list);
1963                 }
1964                 /*
1965                 DEBUG_TRIE_COMPILE_MORE_r(
1966                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1967                 );
1968                 */
1969                 trie->states[ state ].trans.base=base;
1970             }
1971             trie->lasttrans = tp + 1;
1972         }
1973     } else {
1974         /*
1975            Second Pass -- Flat Table Representation.
1976
1977            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1978            We know that we will need Charcount+1 trans at most to store the data
1979            (one row per char at worst case) So we preallocate both structures
1980            assuming worst case.
1981
1982            We then construct the trie using only the .next slots of the entry
1983            structs.
1984
1985            We use the .check field of the first entry of the node temporarily to
1986            make compression both faster and easier by keeping track of how many non
1987            zero fields are in the node.
1988
1989            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1990            transition.
1991
1992            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1993            number representing the first entry of the node, and state as a
1994            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1995            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1996            are 2 entrys per node. eg:
1997
1998              A B       A B
1999           1. 2 4    1. 3 7
2000           2. 0 3    3. 0 5
2001           3. 0 0    5. 0 0
2002           4. 0 0    7. 0 0
2003
2004            The table is internally in the right hand, idx form. However as we also
2005            have to deal with the states array which is indexed by nodenum we have to
2006            use TRIE_NODENUM() to convert.
2007
2008         */
2009         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2010             "%*sCompiling trie using table compiler\n",
2011             (int)depth * 2 + 2, ""));
2012
2013         trie->trans = (reg_trie_trans *)
2014             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2015                                   * trie->uniquecharcount + 1,
2016                                   sizeof(reg_trie_trans) );
2017         trie->states = (reg_trie_state *)
2018             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2019                                   sizeof(reg_trie_state) );
2020         next_alloc = trie->uniquecharcount + 1;
2021
2022
2023         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2024
2025             regnode *noper   = NEXTOPER( cur );
2026             const U8 *uc     = (U8*)STRING( noper );
2027             const U8 *e      = uc + STR_LEN( noper );
2028
2029             U32 state        = 1;         /* required init */
2030
2031             U16 charid       = 0;         /* sanity init */
2032             U32 accept_state = 0;         /* sanity init */
2033             U8 *scan         = (U8*)NULL; /* sanity init */
2034
2035             STRLEN foldlen   = 0;         /* required init */
2036             U32 wordlen      = 0;         /* required init */
2037             STRLEN skiplen   = 0;
2038             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2039
2040             if (OP(noper) == NOTHING) {
2041                 regnode *noper_next= regnext(noper);
2042                 if (noper_next != tail && OP(noper_next) == flags) {
2043                     noper = noper_next;
2044                     uc= (U8*)STRING(noper);
2045                     e= uc + STR_LEN(noper);
2046                 }
2047             }
2048
2049             if ( OP(noper) != NOTHING ) {
2050                 for ( ; uc < e ; uc += len ) {
2051
2052                     TRIE_READ_CHAR;
2053
2054                     if ( uvc < 256 ) {
2055                         charid = trie->charmap[ uvc ];
2056                     } else {
2057                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2058                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2059                     }
2060                     if ( charid ) {
2061                         charid--;
2062                         if ( !trie->trans[ state + charid ].next ) {
2063                             trie->trans[ state + charid ].next = next_alloc;
2064                             trie->trans[ state ].check++;
2065                             prev_states[TRIE_NODENUM(next_alloc)]
2066                                     = TRIE_NODENUM(state);
2067                             next_alloc += trie->uniquecharcount;
2068                         }
2069                         state = trie->trans[ state + charid ].next;
2070                     } else {
2071                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2072                     }
2073                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2074                 }
2075             }
2076             accept_state = TRIE_NODENUM( state );
2077             TRIE_HANDLE_WORD(accept_state);
2078
2079         } /* end second pass */
2080
2081         /* and now dump it out before we compress it */
2082         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2083                                                           revcharmap,
2084                                                           next_alloc, depth+1));
2085
2086         {
2087         /*
2088            * Inplace compress the table.*
2089
2090            For sparse data sets the table constructed by the trie algorithm will
2091            be mostly 0/FAIL transitions or to put it another way mostly empty.
2092            (Note that leaf nodes will not contain any transitions.)
2093
2094            This algorithm compresses the tables by eliminating most such
2095            transitions, at the cost of a modest bit of extra work during lookup:
2096
2097            - Each states[] entry contains a .base field which indicates the
2098            index in the state[] array wheres its transition data is stored.
2099
2100            - If .base is 0 there are no valid transitions from that node.
2101
2102            - If .base is nonzero then charid is added to it to find an entry in
2103            the trans array.
2104
2105            -If trans[states[state].base+charid].check!=state then the
2106            transition is taken to be a 0/Fail transition. Thus if there are fail
2107            transitions at the front of the node then the .base offset will point
2108            somewhere inside the previous nodes data (or maybe even into a node
2109            even earlier), but the .check field determines if the transition is
2110            valid.
2111
2112            XXX - wrong maybe?
2113            The following process inplace converts the table to the compressed
2114            table: We first do not compress the root node 1,and mark all its
2115            .check pointers as 1 and set its .base pointer as 1 as well. This
2116            allows us to do a DFA construction from the compressed table later,
2117            and ensures that any .base pointers we calculate later are greater
2118            than 0.
2119
2120            - We set 'pos' to indicate the first entry of the second node.
2121
2122            - We then iterate over the columns of the node, finding the first and
2123            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2124            and set the .check pointers accordingly, and advance pos
2125            appropriately and repreat for the next node. Note that when we copy
2126            the next pointers we have to convert them from the original
2127            NODEIDX form to NODENUM form as the former is not valid post
2128            compression.
2129
2130            - If a node has no transitions used we mark its base as 0 and do not
2131            advance the pos pointer.
2132
2133            - If a node only has one transition we use a second pointer into the
2134            structure to fill in allocated fail transitions from other states.
2135            This pointer is independent of the main pointer and scans forward
2136            looking for null transitions that are allocated to a state. When it
2137            finds one it writes the single transition into the "hole".  If the
2138            pointer doesnt find one the single transition is appended as normal.
2139
2140            - Once compressed we can Renew/realloc the structures to release the
2141            excess space.
2142
2143            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2144            specifically Fig 3.47 and the associated pseudocode.
2145
2146            demq
2147         */
2148         const U32 laststate = TRIE_NODENUM( next_alloc );
2149         U32 state, charid;
2150         U32 pos = 0, zp=0;
2151         trie->statecount = laststate;
2152
2153         for ( state = 1 ; state < laststate ; state++ ) {
2154             U8 flag = 0;
2155             const U32 stateidx = TRIE_NODEIDX( state );
2156             const U32 o_used = trie->trans[ stateidx ].check;
2157             U32 used = trie->trans[ stateidx ].check;
2158             trie->trans[ stateidx ].check = 0;
2159
2160             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2161                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2162                     if ( trie->trans[ stateidx + charid ].next ) {
2163                         if (o_used == 1) {
2164                             for ( ; zp < pos ; zp++ ) {
2165                                 if ( ! trie->trans[ zp ].next ) {
2166                                     break;
2167                                 }
2168                             }
2169                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2170                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2171                             trie->trans[ zp ].check = state;
2172                             if ( ++zp > pos ) pos = zp;
2173                             break;
2174                         }
2175                         used--;
2176                     }
2177                     if ( !flag ) {
2178                         flag = 1;
2179                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2180                     }
2181                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2182                     trie->trans[ pos ].check = state;
2183                     pos++;
2184                 }
2185             }
2186         }
2187         trie->lasttrans = pos + 1;
2188         trie->states = (reg_trie_state *)
2189             PerlMemShared_realloc( trie->states, laststate
2190                                    * sizeof(reg_trie_state) );
2191         DEBUG_TRIE_COMPILE_MORE_r(
2192                 PerlIO_printf( Perl_debug_log,
2193                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2194                     (int)depth * 2 + 2,"",
2195                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2196                     (IV)next_alloc,
2197                     (IV)pos,
2198                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2199             );
2200
2201         } /* end table compress */
2202     }
2203     DEBUG_TRIE_COMPILE_MORE_r(
2204             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2205                 (int)depth * 2 + 2, "",
2206                 (UV)trie->statecount,
2207                 (UV)trie->lasttrans)
2208     );
2209     /* resize the trans array to remove unused space */
2210     trie->trans = (reg_trie_trans *)
2211         PerlMemShared_realloc( trie->trans, trie->lasttrans
2212                                * sizeof(reg_trie_trans) );
2213
2214     {   /* Modify the program and insert the new TRIE node */
2215         U8 nodetype =(U8)(flags & 0xFF);
2216         char *str=NULL;
2217
2218 #ifdef DEBUGGING
2219         regnode *optimize = NULL;
2220 #ifdef RE_TRACK_PATTERN_OFFSETS
2221
2222         U32 mjd_offset = 0;
2223         U32 mjd_nodelen = 0;
2224 #endif /* RE_TRACK_PATTERN_OFFSETS */
2225 #endif /* DEBUGGING */
2226         /*
2227            This means we convert either the first branch or the first Exact,
2228            depending on whether the thing following (in 'last') is a branch
2229            or not and whther first is the startbranch (ie is it a sub part of
2230            the alternation or is it the whole thing.)
2231            Assuming its a sub part we convert the EXACT otherwise we convert
2232            the whole branch sequence, including the first.
2233          */
2234         /* Find the node we are going to overwrite */
2235         if ( first != startbranch || OP( last ) == BRANCH ) {
2236             /* branch sub-chain */
2237             NEXT_OFF( first ) = (U16)(last - first);
2238 #ifdef RE_TRACK_PATTERN_OFFSETS
2239             DEBUG_r({
2240                 mjd_offset= Node_Offset((convert));
2241                 mjd_nodelen= Node_Length((convert));
2242             });
2243 #endif
2244             /* whole branch chain */
2245         }
2246 #ifdef RE_TRACK_PATTERN_OFFSETS
2247         else {
2248             DEBUG_r({
2249                 const  regnode *nop = NEXTOPER( convert );
2250                 mjd_offset= Node_Offset((nop));
2251                 mjd_nodelen= Node_Length((nop));
2252             });
2253         }
2254         DEBUG_OPTIMISE_r(
2255             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2256                 (int)depth * 2 + 2, "",
2257                 (UV)mjd_offset, (UV)mjd_nodelen)
2258         );
2259 #endif
2260         /* But first we check to see if there is a common prefix we can
2261            split out as an EXACT and put in front of the TRIE node.  */
2262         trie->startstate= 1;
2263         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2264             U32 state;
2265             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2266                 U32 ofs = 0;
2267                 I32 idx = -1;
2268                 U32 count = 0;
2269                 const U32 base = trie->states[ state ].trans.base;
2270
2271                 if ( trie->states[state].wordnum )
2272                         count = 1;
2273
2274                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2275                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2276                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2277                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2278                     {
2279                         if ( ++count > 1 ) {
2280                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2281                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2282                             if ( state == 1 ) break;
2283                             if ( count == 2 ) {
2284                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2285                                 DEBUG_OPTIMISE_r(
2286                                     PerlIO_printf(Perl_debug_log,
2287                                         "%*sNew Start State=%"UVuf" Class: [",
2288                                         (int)depth * 2 + 2, "",
2289                                         (UV)state));
2290                                 if (idx >= 0) {
2291                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2292                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2293
2294                                     TRIE_BITMAP_SET(trie,*ch);
2295                                     if ( folder )
2296                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2297                                     DEBUG_OPTIMISE_r(
2298                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2299                                     );
2300                                 }
2301                             }
2302                             TRIE_BITMAP_SET(trie,*ch);
2303                             if ( folder )
2304                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2305                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2306                         }
2307                         idx = ofs;
2308                     }
2309                 }
2310                 if ( count == 1 ) {
2311                     SV **tmp = av_fetch( revcharmap, idx, 0);
2312                     STRLEN len;
2313                     char *ch = SvPV( *tmp, len );
2314                     DEBUG_OPTIMISE_r({
2315                         SV *sv=sv_newmortal();
2316                         PerlIO_printf( Perl_debug_log,
2317                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2318                             (int)depth * 2 + 2, "",
2319                             (UV)state, (UV)idx,
2320                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2321                                 PL_colors[0], PL_colors[1],
2322                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2323                                 PERL_PV_ESCAPE_FIRSTCHAR
2324                             )
2325                         );
2326                     });
2327                     if ( state==1 ) {
2328                         OP( convert ) = nodetype;
2329                         str=STRING(convert);
2330                         STR_LEN(convert)=0;
2331                     }
2332                     STR_LEN(convert) += len;
2333                     while (len--)
2334                         *str++ = *ch++;
2335                 } else {
2336 #ifdef DEBUGGING
2337                     if (state>1)
2338                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2339 #endif
2340                     break;
2341                 }
2342             }
2343             trie->prefixlen = (state-1);
2344             if (str) {
2345                 regnode *n = convert+NODE_SZ_STR(convert);
2346                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2347                 trie->startstate = state;
2348                 trie->minlen -= (state - 1);
2349                 trie->maxlen -= (state - 1);
2350 #ifdef DEBUGGING
2351                /* At least the UNICOS C compiler choked on this
2352                 * being argument to DEBUG_r(), so let's just have
2353                 * it right here. */
2354                if (
2355 #ifdef PERL_EXT_RE_BUILD
2356                    1
2357 #else
2358                    DEBUG_r_TEST
2359 #endif
2360                    ) {
2361                    regnode *fix = convert;
2362                    U32 word = trie->wordcount;
2363                    mjd_nodelen++;
2364                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2365                    while( ++fix < n ) {
2366                        Set_Node_Offset_Length(fix, 0, 0);
2367                    }
2368                    while (word--) {
2369                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2370                        if (tmp) {
2371                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2372                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2373                            else
2374                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2375                        }
2376                    }
2377                }
2378 #endif
2379                 if (trie->maxlen) {
2380                     convert = n;
2381                 } else {
2382                     NEXT_OFF(convert) = (U16)(tail - convert);
2383                     DEBUG_r(optimize= n);
2384                 }
2385             }
2386         }
2387         if (!jumper)
2388             jumper = last;
2389         if ( trie->maxlen ) {
2390             NEXT_OFF( convert ) = (U16)(tail - convert);
2391             ARG_SET( convert, data_slot );
2392             /* Store the offset to the first unabsorbed branch in
2393                jump[0], which is otherwise unused by the jump logic.
2394                We use this when dumping a trie and during optimisation. */
2395             if (trie->jump)
2396                 trie->jump[0] = (U16)(nextbranch - convert);
2397
2398             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2399              *   and there is a bitmap
2400              *   and the first "jump target" node we found leaves enough room
2401              * then convert the TRIE node into a TRIEC node, with the bitmap
2402              * embedded inline in the opcode - this is hypothetically faster.
2403              */
2404             if ( !trie->states[trie->startstate].wordnum
2405                  && trie->bitmap
2406                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2407             {
2408                 OP( convert ) = TRIEC;
2409                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2410                 PerlMemShared_free(trie->bitmap);
2411                 trie->bitmap= NULL;
2412             } else
2413                 OP( convert ) = TRIE;
2414
2415             /* store the type in the flags */
2416             convert->flags = nodetype;
2417             DEBUG_r({
2418             optimize = convert
2419                       + NODE_STEP_REGNODE
2420                       + regarglen[ OP( convert ) ];
2421             });
2422             /* XXX We really should free up the resource in trie now,
2423                    as we won't use them - (which resources?) dmq */
2424         }
2425         /* needed for dumping*/
2426         DEBUG_r(if (optimize) {
2427             regnode *opt = convert;
2428
2429             while ( ++opt < optimize) {
2430                 Set_Node_Offset_Length(opt,0,0);
2431             }
2432             /*
2433                 Try to clean up some of the debris left after the
2434                 optimisation.
2435              */
2436             while( optimize < jumper ) {
2437                 mjd_nodelen += Node_Length((optimize));
2438                 OP( optimize ) = OPTIMIZED;
2439                 Set_Node_Offset_Length(optimize,0,0);
2440                 optimize++;
2441             }
2442             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2443         });
2444     } /* end node insert */
2445     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2446
2447     /*  Finish populating the prev field of the wordinfo array.  Walk back
2448      *  from each accept state until we find another accept state, and if
2449      *  so, point the first word's .prev field at the second word. If the
2450      *  second already has a .prev field set, stop now. This will be the
2451      *  case either if we've already processed that word's accept state,
2452      *  or that state had multiple words, and the overspill words were
2453      *  already linked up earlier.
2454      */
2455     {
2456         U16 word;
2457         U32 state;
2458         U16 prev;
2459
2460         for (word=1; word <= trie->wordcount; word++) {
2461             prev = 0;
2462             if (trie->wordinfo[word].prev)
2463                 continue;
2464             state = trie->wordinfo[word].accept;
2465             while (state) {
2466                 state = prev_states[state];
2467                 if (!state)
2468                     break;
2469                 prev = trie->states[state].wordnum;
2470                 if (prev)
2471                     break;
2472             }
2473             trie->wordinfo[word].prev = prev;
2474         }
2475         Safefree(prev_states);
2476     }
2477
2478
2479     /* and now dump out the compressed format */
2480     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2481
2482     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2483 #ifdef DEBUGGING
2484     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2485     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2486 #else
2487     SvREFCNT_dec_NN(revcharmap);
2488 #endif
2489     return trie->jump
2490            ? MADE_JUMP_TRIE
2491            : trie->startstate>1
2492              ? MADE_EXACT_TRIE
2493              : MADE_TRIE;
2494 }
2495
2496 STATIC void
2497 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2498 {
2499 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2500
2501    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2502    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2503    ISBN 0-201-10088-6
2504
2505    We find the fail state for each state in the trie, this state is the longest proper
2506    suffix of the current state's 'word' that is also a proper prefix of another word in our
2507    trie. State 1 represents the word '' and is thus the default fail state. This allows
2508    the DFA not to have to restart after its tried and failed a word at a given point, it
2509    simply continues as though it had been matching the other word in the first place.
2510    Consider
2511       'abcdgu'=~/abcdefg|cdgu/
2512    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2513    fail, which would bring us to the state representing 'd' in the second word where we would
2514    try 'g' and succeed, proceeding to match 'cdgu'.
2515  */
2516  /* add a fail transition */
2517     const U32 trie_offset = ARG(source);
2518     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2519     U32 *q;
2520     const U32 ucharcount = trie->uniquecharcount;
2521     const U32 numstates = trie->statecount;
2522     const U32 ubound = trie->lasttrans + ucharcount;
2523     U32 q_read = 0;
2524     U32 q_write = 0;
2525     U32 charid;
2526     U32 base = trie->states[ 1 ].trans.base;
2527     U32 *fail;
2528     reg_ac_data *aho;
2529     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2530     GET_RE_DEBUG_FLAGS_DECL;
2531
2532     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2533 #ifndef DEBUGGING
2534     PERL_UNUSED_ARG(depth);
2535 #endif
2536
2537
2538     ARG_SET( stclass, data_slot );
2539     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2540     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2541     aho->trie=trie_offset;
2542     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2543     Copy( trie->states, aho->states, numstates, reg_trie_state );
2544     Newxz( q, numstates, U32);
2545     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2546     aho->refcount = 1;
2547     fail = aho->fail;
2548     /* initialize fail[0..1] to be 1 so that we always have
2549        a valid final fail state */
2550     fail[ 0 ] = fail[ 1 ] = 1;
2551
2552     for ( charid = 0; charid < ucharcount ; charid++ ) {
2553         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2554         if ( newstate ) {
2555             q[ q_write ] = newstate;
2556             /* set to point at the root */
2557             fail[ q[ q_write++ ] ]=1;
2558         }
2559     }
2560     while ( q_read < q_write) {
2561         const U32 cur = q[ q_read++ % numstates ];
2562         base = trie->states[ cur ].trans.base;
2563
2564         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2565             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2566             if (ch_state) {
2567                 U32 fail_state = cur;
2568                 U32 fail_base;
2569                 do {
2570                     fail_state = fail[ fail_state ];
2571                     fail_base = aho->states[ fail_state ].trans.base;
2572                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2573
2574                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2575                 fail[ ch_state ] = fail_state;
2576                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2577                 {
2578                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2579                 }
2580                 q[ q_write++ % numstates] = ch_state;
2581             }
2582         }
2583     }
2584     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2585        when we fail in state 1, this allows us to use the
2586        charclass scan to find a valid start char. This is based on the principle
2587        that theres a good chance the string being searched contains lots of stuff
2588        that cant be a start char.
2589      */
2590     fail[ 0 ] = fail[ 1 ] = 0;
2591     DEBUG_TRIE_COMPILE_r({
2592         PerlIO_printf(Perl_debug_log,
2593                       "%*sStclass Failtable (%"UVuf" states): 0",
2594                       (int)(depth * 2), "", (UV)numstates
2595         );
2596         for( q_read=1; q_read<numstates; q_read++ ) {
2597             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2598         }
2599         PerlIO_printf(Perl_debug_log, "\n");
2600     });
2601     Safefree(q);
2602     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2603 }
2604
2605
2606 /*
2607  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2608  * These need to be revisited when a newer toolchain becomes available.
2609  */
2610 #if defined(__sparc64__) && defined(__GNUC__)
2611 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2612 #       undef  SPARC64_GCC_WORKAROUND
2613 #       define SPARC64_GCC_WORKAROUND 1
2614 #   endif
2615 #endif
2616
2617 #define DEBUG_PEEP(str,scan,depth) \
2618     DEBUG_OPTIMISE_r({if (scan){ \
2619        SV * const mysv=sv_newmortal(); \
2620        regnode *Next = regnext(scan); \
2621        regprop(RExC_rx, mysv, scan); \
2622        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2623        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2624        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2625    }});
2626
2627
2628 /* The below joins as many adjacent EXACTish nodes as possible into a single
2629  * one.  The regop may be changed if the node(s) contain certain sequences that
2630  * require special handling.  The joining is only done if:
2631  * 1) there is room in the current conglomerated node to entirely contain the
2632  *    next one.
2633  * 2) they are the exact same node type
2634  *
2635  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2636  * these get optimized out
2637  *
2638  * If a node is to match under /i (folded), the number of characters it matches
2639  * can be different than its character length if it contains a multi-character
2640  * fold.  *min_subtract is set to the total delta of the input nodes.
2641  *
2642  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2643  * and contains LATIN SMALL LETTER SHARP S
2644  *
2645  * This is as good a place as any to discuss the design of handling these
2646  * multi-character fold sequences.  It's been wrong in Perl for a very long
2647  * time.  There are three code points in Unicode whose multi-character folds
2648  * were long ago discovered to mess things up.  The previous designs for
2649  * dealing with these involved assigning a special node for them.  This
2650  * approach doesn't work, as evidenced by this example:
2651  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2652  * Both these fold to "sss", but if the pattern is parsed to create a node that
2653  * would match just the \xDF, it won't be able to handle the case where a
2654  * successful match would have to cross the node's boundary.  The new approach
2655  * that hopefully generally solves the problem generates an EXACTFU_SS node
2656  * that is "sss".
2657  *
2658  * It turns out that there are problems with all multi-character folds, and not
2659  * just these three.  Now the code is general, for all such cases, but the
2660  * three still have some special handling.  The approach taken is:
2661  * 1)   This routine examines each EXACTFish node that could contain multi-
2662  *      character fold sequences.  It returns in *min_subtract how much to
2663  *      subtract from the the actual length of the string to get a real minimum
2664  *      match length; it is 0 if there are no multi-char folds.  This delta is
2665  *      used by the caller to adjust the min length of the match, and the delta
2666  *      between min and max, so that the optimizer doesn't reject these
2667  *      possibilities based on size constraints.
2668  * 2)   Certain of these sequences require special handling by the trie code,
2669  *      so, if found, this code changes the joined node type to special ops:
2670  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2671  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2672  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2673  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2674  *      there is a possible fold length change.  That means that a regular
2675  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2676  *      with length changes, and so can be processed faster.  regexec.c takes
2677  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2678  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2679  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2680  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2681  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2682  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2683  *      possibilities for the non-UTF8 patterns are quite simple, except for
2684  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2685  *      members of a fold-pair, and arrays are set up for all of them so that
2686  *      the other member of the pair can be found quickly.  Code elsewhere in
2687  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2688  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2689  *      described in the next item.
2690  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2691  *      'ss' or not is not knowable at compile time.  It will match iff the
2692  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2693  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2694  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2695  *      described in item 3).  An assumption that the optimizer part of
2696  *      regexec.c (probably unwittingly) makes is that a character in the
2697  *      pattern corresponds to at most a single character in the target string.
2698  *      (And I do mean character, and not byte here, unlike other parts of the
2699  *      documentation that have never been updated to account for multibyte
2700  *      Unicode.)  This assumption is wrong only in this case, as all other
2701  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2702  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2703  *      reluctant to try to change this assumption, so instead the code punts.
2704  *      This routine examines EXACTF nodes for the sharp s, and returns a
2705  *      boolean indicating whether or not the node is an EXACTF node that
2706  *      contains a sharp s.  When it is true, the caller sets a flag that later
2707  *      causes the optimizer in this file to not set values for the floating
2708  *      and fixed string lengths, and thus avoids the optimizer code in
2709  *      regexec.c that makes the invalid assumption.  Thus, there is no
2710  *      optimization based on string lengths for EXACTF nodes that contain the
2711  *      sharp s.  This only happens for /id rules (which means the pattern
2712  *      isn't in UTF-8).
2713  */
2714
2715 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2716     if (PL_regkind[OP(scan)] == EXACT) \
2717         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2718
2719 STATIC U32
2720 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2721     /* Merge several consecutive EXACTish nodes into one. */
2722     regnode *n = regnext(scan);
2723     U32 stringok = 1;
2724     regnode *next = scan + NODE_SZ_STR(scan);
2725     U32 merged = 0;
2726     U32 stopnow = 0;
2727 #ifdef DEBUGGING
2728     regnode *stop = scan;
2729     GET_RE_DEBUG_FLAGS_DECL;
2730 #else
2731     PERL_UNUSED_ARG(depth);
2732 #endif
2733
2734     PERL_ARGS_ASSERT_JOIN_EXACT;
2735 #ifndef EXPERIMENTAL_INPLACESCAN
2736     PERL_UNUSED_ARG(flags);
2737     PERL_UNUSED_ARG(val);
2738 #endif
2739     DEBUG_PEEP("join",scan,depth);
2740
2741     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2742      * EXACT ones that are mergeable to the current one. */
2743     while (n
2744            && (PL_regkind[OP(n)] == NOTHING
2745                || (stringok && OP(n) == OP(scan)))
2746            && NEXT_OFF(n)
2747            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2748     {
2749
2750         if (OP(n) == TAIL || n > next)
2751             stringok = 0;
2752         if (PL_regkind[OP(n)] == NOTHING) {
2753             DEBUG_PEEP("skip:",n,depth);
2754             NEXT_OFF(scan) += NEXT_OFF(n);
2755             next = n + NODE_STEP_REGNODE;
2756 #ifdef DEBUGGING
2757             if (stringok)
2758                 stop = n;
2759 #endif
2760             n = regnext(n);
2761         }
2762         else if (stringok) {
2763             const unsigned int oldl = STR_LEN(scan);
2764             regnode * const nnext = regnext(n);
2765
2766             /* XXX I (khw) kind of doubt that this works on platforms where
2767              * U8_MAX is above 255 because of lots of other assumptions */
2768             /* Don't join if the sum can't fit into a single node */
2769             if (oldl + STR_LEN(n) > U8_MAX)
2770                 break;
2771
2772             DEBUG_PEEP("merg",n,depth);
2773             merged++;
2774
2775             NEXT_OFF(scan) += NEXT_OFF(n);
2776             STR_LEN(scan) += STR_LEN(n);
2777             next = n + NODE_SZ_STR(n);
2778             /* Now we can overwrite *n : */
2779             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2780 #ifdef DEBUGGING
2781             stop = next - 1;
2782 #endif
2783             n = nnext;
2784             if (stopnow) break;
2785         }
2786
2787 #ifdef EXPERIMENTAL_INPLACESCAN
2788         if (flags && !NEXT_OFF(n)) {
2789             DEBUG_PEEP("atch", val, depth);
2790             if (reg_off_by_arg[OP(n)]) {
2791                 ARG_SET(n, val - n);
2792             }
2793             else {
2794                 NEXT_OFF(n) = val - n;
2795             }
2796             stopnow = 1;
2797         }
2798 #endif
2799     }
2800
2801     *min_subtract = 0;
2802     *has_exactf_sharp_s = FALSE;
2803
2804     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2805      * can now analyze for sequences of problematic code points.  (Prior to
2806      * this final joining, sequences could have been split over boundaries, and
2807      * hence missed).  The sequences only happen in folding, hence for any
2808      * non-EXACT EXACTish node */
2809     if (OP(scan) != EXACT) {
2810         const U8 * const s0 = (U8*) STRING(scan);
2811         const U8 * s = s0;
2812         const U8 * const s_end = s0 + STR_LEN(scan);
2813
2814         /* One pass is made over the node's string looking for all the
2815          * possibilities.  to avoid some tests in the loop, there are two main
2816          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2817          * non-UTF-8 */
2818         if (UTF) {
2819
2820             /* Examine the string for a multi-character fold sequence.  UTF-8
2821              * patterns have all characters pre-folded by the time this code is
2822              * executed */
2823             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2824                                      length sequence we are looking for is 2 */
2825             {
2826                 int count = 0;
2827                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2828                 if (! len) {    /* Not a multi-char fold: get next char */
2829                     s += UTF8SKIP(s);
2830                     continue;
2831                 }
2832
2833                 /* Nodes with 'ss' require special handling, except for EXACTFL
2834                  * and EXACTFA for which there is no multi-char fold to this */
2835                 if (len == 2 && *s == 's' && *(s+1) == 's'
2836                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2837                 {
2838                     count = 2;
2839                     OP(scan) = EXACTFU_SS;
2840                     s += 2;
2841                 }
2842                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2843                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2844                                       COMBINING_DIAERESIS_UTF8
2845                                       COMBINING_ACUTE_ACCENT_UTF8,
2846                                    6)
2847                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2848                                          COMBINING_DIAERESIS_UTF8
2849                                          COMBINING_ACUTE_ACCENT_UTF8,
2850                                      6)))
2851                 {
2852                     count = 3;
2853
2854                     /* These two folds require special handling by trie's, so
2855                      * change the node type to indicate this.  If EXACTFA and
2856                      * EXACTFL were ever to be handled by trie's, this would
2857                      * have to be changed.  If this node has already been
2858                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2859                      * (khw) think it doesn't matter in regexec.c for UTF
2860                      * patterns, but no need to change it */
2861                     if (OP(scan) == EXACTFU) {
2862                         OP(scan) = EXACTFU_TRICKYFOLD;
2863                     }
2864                     s += 6;
2865                 }
2866                 else { /* Here is a generic multi-char fold. */
2867                     const U8* multi_end  = s + len;
2868
2869                     /* Count how many characters in it.  In the case of /l and
2870                      * /aa, no folds which contain ASCII code points are
2871                      * allowed, so check for those, and skip if found.  (In
2872                      * EXACTFL, no folds are allowed to any Latin1 code point,
2873                      * not just ASCII.  But there aren't any of these
2874                      * currently, nor ever likely, so don't take the time to
2875                      * test for them.  The code that generates the
2876                      * is_MULTI_foo() macros croaks should one actually get put
2877                      * into Unicode .) */
2878                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2879                         count = utf8_length(s, multi_end);
2880                         s = multi_end;
2881                     }
2882                     else {
2883                         while (s < multi_end) {
2884                             if (isASCII(*s)) {
2885                                 s++;
2886                                 goto next_iteration;
2887                             }
2888                             else {
2889                                 s += UTF8SKIP(s);
2890                             }
2891                             count++;
2892                         }
2893                     }
2894                 }
2895
2896                 /* The delta is how long the sequence is minus 1 (1 is how long
2897                  * the character that folds to the sequence is) */
2898                 *min_subtract += count - 1;
2899             next_iteration: ;
2900             }
2901         }
2902         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2903
2904             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2905              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2906              * nodes can't have multi-char folds to this range (and there are
2907              * no existing ones in the upper latin1 range).  In the EXACTF
2908              * case we look also for the sharp s, which can be in the final
2909              * position.  Otherwise we can stop looking 1 byte earlier because
2910              * have to find at least two characters for a multi-fold */
2911             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912
2913             /* The below is perhaps overboard, but this allows us to save a
2914              * test each time through the loop at the expense of a mask.  This
2915              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2916              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2917              * are 64.  This uses an exclusive 'or' to find that bit and then
2918              * inverts it to form a mask, with just a single 0, in the bit
2919              * position where 'S' and 's' differ. */
2920             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2921             const U8 s_masked = 's' & S_or_s_mask;
2922
2923             while (s < upper) {
2924                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2925                 if (! len) {    /* Not a multi-char fold. */
2926                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2927                     {
2928                         *has_exactf_sharp_s = TRUE;
2929                     }
2930                     s++;
2931                     continue;
2932                 }
2933
2934                 if (len == 2
2935                     && ((*s & S_or_s_mask) == s_masked)
2936                     && ((*(s+1) & S_or_s_mask) == s_masked))
2937                 {
2938
2939                     /* EXACTF nodes need to know that the minimum length
2940                      * changed so that a sharp s in the string can match this
2941                      * ss in the pattern, but they remain EXACTF nodes, as they
2942                      * won't match this unless the target string is is UTF-8,
2943                      * which we don't know until runtime */
2944                     if (OP(scan) != EXACTF) {
2945                         OP(scan) = EXACTFU_SS;
2946                     }
2947                 }
2948
2949                 *min_subtract += len - 1;
2950                 s += len;
2951             }
2952         }
2953     }
2954
2955 #ifdef DEBUGGING
2956     /* Allow dumping but overwriting the collection of skipped
2957      * ops and/or strings with fake optimized ops */
2958     n = scan + NODE_SZ_STR(scan);
2959     while (n <= stop) {
2960         OP(n) = OPTIMIZED;
2961         FLAGS(n) = 0;
2962         NEXT_OFF(n) = 0;
2963         n++;
2964     }
2965 #endif
2966     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2967     return stopnow;
2968 }
2969
2970 /* REx optimizer.  Converts nodes into quicker variants "in place".
2971    Finds fixed substrings.  */
2972
2973 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2974    to the position after last scanned or to NULL. */
2975
2976 #define INIT_AND_WITHP \
2977     assert(!and_withp); \
2978     Newx(and_withp,1,struct regnode_charclass_class); \
2979     SAVEFREEPV(and_withp)
2980
2981 /* this is a chain of data about sub patterns we are processing that
2982    need to be handled separately/specially in study_chunk. Its so
2983    we can simulate recursion without losing state.  */
2984 struct scan_frame;
2985 typedef struct scan_frame {
2986     regnode *last;  /* last node to process in this frame */
2987     regnode *next;  /* next node to process when last is reached */
2988     struct scan_frame *prev; /*previous frame*/
2989     I32 stop; /* what stopparen do we use */
2990 } scan_frame;
2991
2992
2993 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2994
2995 STATIC I32
2996 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2997                         I32 *minlenp, I32 *deltap,
2998                         regnode *last,
2999                         scan_data_t *data,
3000                         I32 stopparen,
3001                         U8* recursed,
3002                         struct regnode_charclass_class *and_withp,
3003                         U32 flags, U32 depth)
3004                         /* scanp: Start here (read-write). */
3005                         /* deltap: Write maxlen-minlen here. */
3006                         /* last: Stop before this one. */
3007                         /* data: string data about the pattern */
3008                         /* stopparen: treat close N as END */
3009                         /* recursed: which subroutines have we recursed into */
3010                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3011 {
3012     dVAR;
3013     I32 min = 0;    /* There must be at least this number of characters to match */
3014     I32 pars = 0, code;
3015     regnode *scan = *scanp, *next;
3016     I32 delta = 0;
3017     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3018     int is_inf_internal = 0;            /* The studied chunk is infinite */
3019     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3020     scan_data_t data_fake;
3021     SV *re_trie_maxbuff = NULL;
3022     regnode *first_non_open = scan;
3023     I32 stopmin = I32_MAX;
3024     scan_frame *frame = NULL;
3025     GET_RE_DEBUG_FLAGS_DECL;
3026
3027     PERL_ARGS_ASSERT_STUDY_CHUNK;
3028
3029 #ifdef DEBUGGING
3030     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3031 #endif
3032
3033     if ( depth == 0 ) {
3034         while (first_non_open && OP(first_non_open) == OPEN)
3035             first_non_open=regnext(first_non_open);
3036     }
3037
3038
3039   fake_study_recurse:
3040     while ( scan && OP(scan) != END && scan < last ){
3041         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3042                                    node length to get a real minimum (because
3043                                    the folded version may be shorter) */
3044         bool has_exactf_sharp_s = FALSE;
3045         /* Peephole optimizer: */
3046         DEBUG_STUDYDATA("Peep:", data,depth);
3047         DEBUG_PEEP("Peep",scan,depth);
3048
3049         /* Its not clear to khw or hv why this is done here, and not in the
3050          * clauses that deal with EXACT nodes.  khw's guess is that it's
3051          * because of a previous design */
3052         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3053
3054         /* Follow the next-chain of the current node and optimize
3055            away all the NOTHINGs from it.  */
3056         if (OP(scan) != CURLYX) {
3057             const int max = (reg_off_by_arg[OP(scan)]
3058                        ? I32_MAX
3059                        /* I32 may be smaller than U16 on CRAYs! */
3060                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3061             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3062             int noff;
3063             regnode *n = scan;
3064
3065             /* Skip NOTHING and LONGJMP. */
3066             while ((n = regnext(n))
3067                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3068                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3069                    && off + noff < max)
3070                 off += noff;
3071             if (reg_off_by_arg[OP(scan)])
3072                 ARG(scan) = off;
3073             else
3074                 NEXT_OFF(scan) = off;
3075         }
3076
3077
3078
3079         /* The principal pseudo-switch.  Cannot be a switch, since we
3080            look into several different things.  */
3081         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3082                    || OP(scan) == IFTHEN) {
3083             next = regnext(scan);
3084             code = OP(scan);
3085             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3086
3087             if (OP(next) == code || code == IFTHEN) {
3088                 /* NOTE - There is similar code to this block below for handling
3089                    TRIE nodes on a re-study.  If you change stuff here check there
3090                    too. */
3091                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3092                 struct regnode_charclass_class accum;
3093                 regnode * const startbranch=scan;
3094
3095                 if (flags & SCF_DO_SUBSTR)
3096                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3097                 if (flags & SCF_DO_STCLASS)
3098                     cl_init_zero(pRExC_state, &accum);
3099
3100                 while (OP(scan) == code) {
3101                     I32 deltanext, minnext, f = 0, fake;
3102                     struct regnode_charclass_class this_class;
3103
3104                     num++;
3105                     data_fake.flags = 0;
3106                     if (data) {
3107                         data_fake.whilem_c = data->whilem_c;
3108                         data_fake.last_closep = data->last_closep;
3109                     }
3110                     else
3111                         data_fake.last_closep = &fake;
3112
3113                     data_fake.pos_delta = delta;
3114                     next = regnext(scan);
3115                     scan = NEXTOPER(scan);
3116                     if (code != BRANCH)
3117                         scan = NEXTOPER(scan);
3118                     if (flags & SCF_DO_STCLASS) {
3119                         cl_init(pRExC_state, &this_class);
3120                         data_fake.start_class = &this_class;
3121                         f = SCF_DO_STCLASS_AND;
3122                     }
3123                     if (flags & SCF_WHILEM_VISITED_POS)
3124                         f |= SCF_WHILEM_VISITED_POS;
3125
3126                     /* we suppose the run is continuous, last=next...*/
3127                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3128                                           next, &data_fake,
3129                                           stopparen, recursed, NULL, f,depth+1);
3130                     if (min1 > minnext)
3131                         min1 = minnext;
3132                     if (deltanext == I32_MAX) {
3133                         is_inf = is_inf_internal = 1;
3134                         max1 = I32_MAX;
3135                     } else if (max1 < minnext + deltanext)
3136                         max1 = minnext + deltanext;
3137                     scan = next;
3138                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3139                         pars++;
3140                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3141                         if ( stopmin > minnext)
3142                             stopmin = min + min1;
3143                         flags &= ~SCF_DO_SUBSTR;
3144                         if (data)
3145                             data->flags |= SCF_SEEN_ACCEPT;
3146                     }
3147                     if (data) {
3148                         if (data_fake.flags & SF_HAS_EVAL)
3149                             data->flags |= SF_HAS_EVAL;
3150                         data->whilem_c = data_fake.whilem_c;
3151                     }
3152                     if (flags & SCF_DO_STCLASS)
3153                         cl_or(pRExC_state, &accum, &this_class);
3154                 }
3155                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3156                     min1 = 0;
3157                 if (flags & SCF_DO_SUBSTR) {
3158                     data->pos_min += min1;
3159                     if (data->pos_delta >= I32_MAX - (max1 - min1))
3160                         data->pos_delta = I32_MAX;
3161                     else
3162                         data->pos_delta += max1 - min1;
3163                     if (max1 != min1 || is_inf)
3164                         data->longest = &(data->longest_float);
3165                 }
3166                 min += min1;
3167                 if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3168                     delta = I32_MAX;
3169                 else
3170                     delta += max1 - min1;
3171                 if (flags & SCF_DO_STCLASS_OR) {
3172                     cl_or(pRExC_state, data->start_class, &accum);
3173                     if (min1) {
3174                         cl_and(data->start_class, and_withp);
3175                         flags &= ~SCF_DO_STCLASS;
3176                     }
3177                 }
3178                 else if (flags & SCF_DO_STCLASS_AND) {
3179                     if (min1) {
3180                         cl_and(data->start_class, &accum);
3181                         flags &= ~SCF_DO_STCLASS;
3182                     }
3183                     else {
3184                         /* Switch to OR mode: cache the old value of
3185                          * data->start_class */
3186                         INIT_AND_WITHP;
3187                         StructCopy(data->start_class, and_withp,
3188                                    struct regnode_charclass_class);
3189                         flags &= ~SCF_DO_STCLASS_AND;
3190                         StructCopy(&accum, data->start_class,
3191                                    struct regnode_charclass_class);
3192                         flags |= SCF_DO_STCLASS_OR;
3193                         SET_SSC_EOS(data->start_class);
3194                     }
3195                 }
3196
3197                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3198                 /* demq.
3199
3200                    Assuming this was/is a branch we are dealing with: 'scan' now
3201                    points at the item that follows the branch sequence, whatever
3202                    it is. We now start at the beginning of the sequence and look
3203                    for subsequences of
3204
3205                    BRANCH->EXACT=>x1
3206                    BRANCH->EXACT=>x2
3207                    tail
3208
3209                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3210
3211                    If we can find such a subsequence we need to turn the first
3212                    element into a trie and then add the subsequent branch exact
3213                    strings to the trie.
3214
3215                    We have two cases
3216
3217                      1. patterns where the whole set of branches can be converted.
3218
3219                      2. patterns where only a subset can be converted.
3220
3221                    In case 1 we can replace the whole set with a single regop
3222                    for the trie. In case 2 we need to keep the start and end
3223                    branches so
3224
3225                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3226                      becomes BRANCH TRIE; BRANCH X;
3227
3228                   There is an additional case, that being where there is a
3229                   common prefix, which gets split out into an EXACT like node
3230                   preceding the TRIE node.
3231
3232                   If x(1..n)==tail then we can do a simple trie, if not we make
3233                   a "jump" trie, such that when we match the appropriate word
3234                   we "jump" to the appropriate tail node. Essentially we turn
3235                   a nested if into a case structure of sorts.
3236
3237                 */
3238
3239                     int made=0;
3240                     if (!re_trie_maxbuff) {
3241                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3242                         if (!SvIOK(re_trie_maxbuff))
3243                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3244                     }
3245                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3246                         regnode *cur;
3247                         regnode *first = (regnode *)NULL;
3248                         regnode *last = (regnode *)NULL;
3249                         regnode *tail = scan;
3250                         U8 trietype = 0;
3251                         U32 count=0;
3252
3253 #ifdef DEBUGGING
3254                         SV * const mysv = sv_newmortal();       /* for dumping */
3255 #endif
3256                         /* var tail is used because there may be a TAIL
3257                            regop in the way. Ie, the exacts will point to the
3258                            thing following the TAIL, but the last branch will
3259                            point at the TAIL. So we advance tail. If we
3260                            have nested (?:) we may have to move through several
3261                            tails.
3262                          */
3263
3264                         while ( OP( tail ) == TAIL ) {
3265                             /* this is the TAIL generated by (?:) */
3266                             tail = regnext( tail );
3267                         }
3268
3269
3270                         DEBUG_TRIE_COMPILE_r({
3271                             regprop(RExC_rx, mysv, tail );
3272                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3273                                 (int)depth * 2 + 2, "",
3274                                 "Looking for TRIE'able sequences. Tail node is: ",
3275                                 SvPV_nolen_const( mysv )
3276                             );
3277                         });
3278
3279                         /*
3280
3281                             Step through the branches
3282                                 cur represents each branch,
3283                                 noper is the first thing to be matched as part of that branch
3284                                 noper_next is the regnext() of that node.
3285
3286                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3287                             via a "jump trie" but we also support building with NOJUMPTRIE,
3288                             which restricts the trie logic to structures like /FOO|BAR/.
3289
3290                             If noper is a trieable nodetype then the branch is a possible optimization
3291                             target. If we are building under NOJUMPTRIE then we require that noper_next
3292                             is the same as scan (our current position in the regex program).
3293
3294                             Once we have two or more consecutive such branches we can create a
3295                             trie of the EXACT's contents and stitch it in place into the program.
3296
3297                             If the sequence represents all of the branches in the alternation we
3298                             replace the entire thing with a single TRIE node.
3299
3300                             Otherwise when it is a subsequence we need to stitch it in place and
3301                             replace only the relevant branches. This means the first branch has
3302                             to remain as it is used by the alternation logic, and its next pointer,
3303                             and needs to be repointed at the item on the branch chain following
3304                             the last branch we have optimized away.
3305
3306                             This could be either a BRANCH, in which case the subsequence is internal,
3307                             or it could be the item following the branch sequence in which case the
3308                             subsequence is at the end (which does not necessarily mean the first node
3309                             is the start of the alternation).
3310
3311                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3312
3313                                 optype          |  trietype
3314                                 ----------------+-----------
3315                                 NOTHING         | NOTHING
3316                                 EXACT           | EXACT
3317                                 EXACTFU         | EXACTFU
3318                                 EXACTFU_SS      | EXACTFU
3319                                 EXACTFU_TRICKYFOLD | EXACTFU
3320                                 EXACTFA         | 0
3321
3322
3323                         */
3324 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3325                        ( EXACT == (X) )   ? EXACT :        \
3326                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3327                        0 )
3328
3329                         /* dont use tail as the end marker for this traverse */
3330                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3331                             regnode * const noper = NEXTOPER( cur );
3332                             U8 noper_type = OP( noper );
3333                             U8 noper_trietype = TRIE_TYPE( noper_type );
3334 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3335                             regnode * const noper_next = regnext( noper );
3336                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3337                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3338 #endif
3339
3340                             DEBUG_TRIE_COMPILE_r({
3341                                 regprop(RExC_rx, mysv, cur);
3342                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3343                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3344
3345                                 regprop(RExC_rx, mysv, noper);
3346                                 PerlIO_printf( Perl_debug_log, " -> %s",
3347                                     SvPV_nolen_const(mysv));
3348
3349                                 if ( noper_next ) {
3350                                   regprop(RExC_rx, mysv, noper_next );
3351                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3352                                     SvPV_nolen_const(mysv));
3353                                 }
3354                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3355                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3356                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3357                                 );
3358                             });
3359
3360                             /* Is noper a trieable nodetype that can be merged with the
3361                              * current trie (if there is one)? */
3362                             if ( noper_trietype
3363                                   &&
3364                                   (
3365                                         ( noper_trietype == NOTHING)
3366                                         || ( trietype == NOTHING )
3367                                         || ( trietype == noper_trietype )
3368                                   )
3369 #ifdef NOJUMPTRIE
3370                                   && noper_next == tail
3371 #endif
3372                                   && count < U16_MAX)
3373                             {
3374                                 /* Handle mergable triable node
3375                                  * Either we are the first node in a new trieable sequence,
3376                                  * in which case we do some bookkeeping, otherwise we update
3377                                  * the end pointer. */
3378                                 if ( !first ) {
3379                                     first = cur;
3380                                     if ( noper_trietype == NOTHING ) {
3381 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3382                                         regnode * const noper_next = regnext( noper );
3383                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3384                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3385 #endif
3386
3387                                         if ( noper_next_trietype ) {
3388                                             trietype = noper_next_trietype;
3389                                         } else if (noper_next_type)  {
3390                                             /* a NOTHING regop is 1 regop wide. We need at least two
3391                                              * for a trie so we can't merge this in */
3392                                             first = NULL;
3393                                         }
3394                                     } else {
3395                                         trietype = noper_trietype;
3396                                     }
3397                                 } else {
3398                                     if ( trietype == NOTHING )
3399                                         trietype = noper_trietype;
3400                                     last = cur;
3401                                 }
3402                                 if (first)
3403                                     count++;
3404                             } /* end handle mergable triable node */
3405                             else {
3406                                 /* handle unmergable node -
3407                                  * noper may either be a triable node which can not be tried
3408                                  * together with the current trie, or a non triable node */
3409                                 if ( last ) {
3410                                     /* If last is set and trietype is not NOTHING then we have found
3411                                      * at least two triable branch sequences in a row of a similar
3412                                      * trietype so we can turn them into a trie. If/when we
3413                                      * allow NOTHING to start a trie sequence this condition will be
3414                                      * required, and it isn't expensive so we leave it in for now. */
3415                                     if ( trietype && trietype != NOTHING )
3416                                         make_trie( pRExC_state,
3417                                                 startbranch, first, cur, tail, count,
3418                                                 trietype, depth+1 );
3419                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3420                                 }
3421                                 if ( noper_trietype
3422 #ifdef NOJUMPTRIE
3423                                      && noper_next == tail
3424 #endif
3425                                 ){
3426                                     /* noper is triable, so we can start a new trie sequence */
3427                                     count = 1;
3428                                     first = cur;
3429                                     trietype = noper_trietype;
3430                                 } else if (first) {
3431                                     /* if we already saw a first but the current node is not triable then we have
3432                                      * to reset the first information. */
3433                                     count = 0;
3434                                     first = NULL;
3435                                     trietype = 0;
3436                                 }
3437                             } /* end handle unmergable node */
3438                         } /* loop over branches */
3439                         DEBUG_TRIE_COMPILE_r({
3440                             regprop(RExC_rx, mysv, cur);
3441                             PerlIO_printf( Perl_debug_log,
3442                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3443                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3444
3445                         });
3446                         if ( last && trietype ) {
3447                             if ( trietype != NOTHING ) {
3448                                 /* the last branch of the sequence was part of a trie,
3449                                  * so we have to construct it here outside of the loop
3450                                  */
3451                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3452 #ifdef TRIE_STUDY_OPT
3453                                 if ( ((made == MADE_EXACT_TRIE &&
3454                                      startbranch == first)
3455                                      || ( first_non_open == first )) &&
3456                                      depth==0 ) {
3457                                     flags |= SCF_TRIE_RESTUDY;
3458                                     if ( startbranch == first
3459                                          && scan == tail )
3460                                     {
3461                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3462                                     }
3463                                 }
3464 #endif
3465                             } else {
3466                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3467                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3468                                  */
3469                                 if ( startbranch == first ) {
3470                                     regnode *opt;
3471                                     /* the entire thing is a NOTHING sequence, something like this:
3472                                      * (?:|) So we can turn it into a plain NOTHING op. */
3473                                     DEBUG_TRIE_COMPILE_r({
3474                                         regprop(RExC_rx, mysv, cur);
3475                                         PerlIO_printf( Perl_debug_log,
3476                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3477                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3478
3479                                     });
3480                                     OP(startbranch)= NOTHING;
3481                                     NEXT_OFF(startbranch)= tail - startbranch;
3482                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3483                                         OP(opt)= OPTIMIZED;
3484                                 }
3485                             }
3486                         } /* end if ( last) */
3487                     } /* TRIE_MAXBUF is non zero */
3488
3489                 } /* do trie */
3490
3491             }
3492             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3493                 scan = NEXTOPER(NEXTOPER(scan));
3494             } else                      /* single branch is optimized. */
3495                 scan = NEXTOPER(scan);
3496             continue;
3497         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3498             scan_frame *newframe = NULL;
3499             I32 paren;
3500             regnode *start;
3501             regnode *end;
3502
3503             if (OP(scan) != SUSPEND) {
3504             /* set the pointer */
3505                 if (OP(scan) == GOSUB) {
3506                     paren = ARG(scan);
3507                     RExC_recurse[ARG2L(scan)] = scan;
3508                     start = RExC_open_parens[paren-1];
3509                     end   = RExC_close_parens[paren-1];
3510                 } else {
3511                     paren = 0;
3512                     start = RExC_rxi->program + 1;
3513                     end   = RExC_opend;
3514                 }
3515                 if (!recursed) {
3516                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3517                     SAVEFREEPV(recursed);
3518                 }
3519                 if (!PAREN_TEST(recursed,paren+1)) {
3520                     PAREN_SET(recursed,paren+1);
3521                     Newx(newframe,1,scan_frame);
3522                 } else {
3523                     if (flags & SCF_DO_SUBSTR) {
3524                         SCAN_COMMIT(pRExC_state,data,minlenp);
3525                         data->longest = &(data->longest_float);
3526                     }
3527                     is_inf = is_inf_internal = 1;
3528                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3529                         cl_anything(pRExC_state, data->start_class);
3530                     flags &= ~SCF_DO_STCLASS;
3531                 }
3532             } else {
3533                 Newx(newframe,1,scan_frame);
3534                 paren = stopparen;
3535                 start = scan+2;
3536                 end = regnext(scan);
3537             }
3538             if (newframe) {
3539                 assert(start);
3540                 assert(end);
3541                 SAVEFREEPV(newframe);
3542                 newframe->next = regnext(scan);
3543                 newframe->last = last;
3544                 newframe->stop = stopparen;
3545                 newframe->prev = frame;
3546
3547                 frame = newframe;
3548                 scan =  start;
3549                 stopparen = paren;
3550                 last = end;
3551
3552                 continue;
3553             }
3554         }
3555         else if (OP(scan) == EXACT) {
3556             I32 l = STR_LEN(scan);
3557             UV uc;
3558             if (UTF) {
3559                 const U8 * const s = (U8*)STRING(scan);
3560                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3561                 l = utf8_length(s, s + l);
3562             } else {
3563                 uc = *((U8*)STRING(scan));
3564             }
3565             min += l;
3566             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3567                 /* The code below prefers earlier match for fixed
3568                    offset, later match for variable offset.  */
3569                 if (data->last_end == -1) { /* Update the start info. */
3570                     data->last_start_min = data->pos_min;
3571                     data->last_start_max = is_inf
3572                         ? I32_MAX : data->pos_min + data->pos_delta;
3573                 }
3574                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3575                 if (UTF)
3576                     SvUTF8_on(data->last_found);
3577                 {
3578                     SV * const sv = data->last_found;
3579                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3580                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3581                     if (mg && mg->mg_len >= 0)
3582                         mg->mg_len += utf8_length((U8*)STRING(scan),
3583                                                   (U8*)STRING(scan)+STR_LEN(scan));
3584                 }
3585                 data->last_end = data->pos_min + l;
3586                 data->pos_min += l; /* As in the first entry. */
3587                 data->flags &= ~SF_BEFORE_EOL;
3588             }
3589             if (flags & SCF_DO_STCLASS_AND) {
3590                 /* Check whether it is compatible with what we know already! */
3591                 int compat = 1;
3592
3593
3594                 /* If compatible, we or it in below.  It is compatible if is
3595                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3596                  * it's for a locale.  Even if there isn't unicode semantics
3597                  * here, at runtime there may be because of matching against a
3598                  * utf8 string, so accept a possible false positive for
3599                  * latin1-range folds */
3600                 if (uc >= 0x100 ||
3601                     (!(data->start_class->flags & ANYOF_LOCALE)
3602                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3603                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3604                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3605                     )
3606                 {
3607                     compat = 0;
3608                 }
3609                 ANYOF_CLASS_ZERO(data->start_class);
3610                 ANYOF_BITMAP_ZERO(data->start_class);
3611                 if (compat)
3612                     ANYOF_BITMAP_SET(data->start_class, uc);
3613                 else if (uc >= 0x100) {
3614                     int i;
3615
3616                     /* Some Unicode code points fold to the Latin1 range; as
3617                      * XXX temporary code, instead of figuring out if this is
3618                      * one, just assume it is and set all the start class bits
3619                      * that could be some such above 255 code point's fold
3620                      * which will generate fals positives.  As the code
3621                      * elsewhere that does compute the fold settles down, it
3622                      * can be extracted out and re-used here */
3623                     for (i = 0; i < 256; i++){
3624                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3625                             ANYOF_BITMAP_SET(data->start_class, i);
3626                         }
3627                     }
3628                 }
3629                 CLEAR_SSC_EOS(data->start_class);
3630                 if (uc < 0x100)
3631                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3632             }
3633             else if (flags & SCF_DO_STCLASS_OR) {
3634                 /* false positive possible if the class is case-folded */
3635                 if (uc < 0x100)
3636                     ANYOF_BITMAP_SET(data->start_class, uc);
3637                 else
3638                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3639                 CLEAR_SSC_EOS(data->start_class);
3640                 cl_and(data->start_class, and_withp);
3641             }
3642             flags &= ~SCF_DO_STCLASS;
3643         }
3644         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3645             I32 l = STR_LEN(scan);
3646             UV uc = *((U8*)STRING(scan));
3647
3648             /* Search for fixed substrings supports EXACT only. */
3649             if (flags & SCF_DO_SUBSTR) {
3650                 assert(data);
3651                 SCAN_COMMIT(pRExC_state, data, minlenp);
3652             }
3653             if (UTF) {
3654                 const U8 * const s = (U8 *)STRING(scan);
3655                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3656                 l = utf8_length(s, s + l);
3657             }
3658             if (has_exactf_sharp_s) {
3659                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3660             }
3661             min += l - min_subtract;
3662             assert (min >= 0);
3663             delta += min_subtract;
3664             if (flags & SCF_DO_SUBSTR) {
3665                 data->pos_min += l - min_subtract;
3666                 if (data->pos_min < 0) {
3667                     data->pos_min = 0;
3668                 }
3669                 data->pos_delta += min_subtract;
3670                 if (min_subtract) {
3671                     data->longest = &(data->longest_float);
3672                 }
3673             }
3674             if (flags & SCF_DO_STCLASS_AND) {
3675                 /* Check whether it is compatible with what we know already! */
3676                 int compat = 1;
3677                 if (uc >= 0x100 ||
3678                  (!(data->start_class->flags & ANYOF_LOCALE)
3679                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3680                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3681                 {
3682                     compat = 0;
3683                 }
3684                 ANYOF_CLASS_ZERO(data->start_class);
3685                 ANYOF_BITMAP_ZERO(data->start_class);
3686                 if (compat) {
3687                     ANYOF_BITMAP_SET(data->start_class, uc);
3688                     CLEAR_SSC_EOS(data->start_class);
3689                     if (OP(scan) == EXACTFL) {
3690                         /* XXX This set is probably no longer necessary, and
3691                          * probably wrong as LOCALE now is on in the initial
3692                          * state */
3693                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3694                     }
3695                     else {
3696
3697                         /* Also set the other member of the fold pair.  In case
3698                          * that unicode semantics is called for at runtime, use
3699                          * the full latin1 fold.  (Can't do this for locale,
3700                          * because not known until runtime) */
3701                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3702
3703                         /* All other (EXACTFL handled above) folds except under
3704                          * /iaa that include s, S, and sharp_s also may include
3705                          * the others */
3706                         if (OP(scan) != EXACTFA) {
3707                             if (uc == 's' || uc == 'S') {
3708                                 ANYOF_BITMAP_SET(data->start_class,
3709                                                  LATIN_SMALL_LETTER_SHARP_S);
3710                             }
3711                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712                                 ANYOF_BITMAP_SET(data->start_class, 's');
3713                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3714                             }
3715                         }
3716                     }
3717                 }
3718                 else if (uc >= 0x100) {
3719                     int i;
3720                     for (i = 0; i < 256; i++){
3721                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722                             ANYOF_BITMAP_SET(data->start_class, i);
3723                         }
3724                     }
3725                 }
3726             }
3727             else if (flags & SCF_DO_STCLASS_OR) {
3728                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3729                     /* false positive possible if the class is case-folded.
3730                        Assume that the locale settings are the same... */
3731                     if (uc < 0x100) {
3732                         ANYOF_BITMAP_SET(data->start_class, uc);
3733                         if (OP(scan) != EXACTFL) {
3734
3735                             /* And set the other member of the fold pair, but
3736                              * can't do that in locale because not known until
3737                              * run-time */
3738                             ANYOF_BITMAP_SET(data->start_class,
3739                                              PL_fold_latin1[uc]);
3740
3741                             /* All folds except under /iaa that include s, S,
3742                              * and sharp_s also may include the others */
3743                             if (OP(scan) != EXACTFA) {
3744                                 if (uc == 's' || uc == 'S') {
3745                                     ANYOF_BITMAP_SET(data->start_class,
3746                                                    LATIN_SMALL_LETTER_SHARP_S);
3747                                 }
3748                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749                                     ANYOF_BITMAP_SET(data->start_class, 's');
3750                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3751                                 }
3752                             }
3753                         }
3754                     }
3755                     CLEAR_SSC_EOS(data->start_class);
3756                 }
3757                 cl_and(data->start_class, and_withp);
3758             }
3759             flags &= ~SCF_DO_STCLASS;
3760         }
3761         else if (REGNODE_VARIES(OP(scan))) {
3762             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763             I32 f = flags, pos_before = 0;
3764             regnode * const oscan = scan;
3765             struct regnode_charclass_class this_class;
3766             struct regnode_charclass_class *oclass = NULL;
3767             I32 next_is_eval = 0;
3768
3769             switch (PL_regkind[OP(scan)]) {
3770             case WHILEM:                /* End of (?:...)* . */
3771                 scan = NEXTOPER(scan);
3772                 goto finish;
3773             case PLUS:
3774                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775                     next = NEXTOPER(scan);
3776                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3777                         mincount = 1;
3778                         maxcount = REG_INFTY;
3779                         next = regnext(scan);
3780                         scan = NEXTOPER(scan);
3781                         goto do_curly;
3782                     }
3783                 }
3784                 if (flags & SCF_DO_SUBSTR)
3785                     data->pos_min++;
3786                 min++;
3787                 /* Fall through. */
3788             case STAR:
3789                 if (flags & SCF_DO_STCLASS) {
3790                     mincount = 0;
3791                     maxcount = REG_INFTY;
3792                     next = regnext(scan);
3793                     scan = NEXTOPER(scan);
3794                     goto do_curly;
3795                 }
3796                 is_inf = is_inf_internal = 1;
3797                 scan = regnext(scan);
3798                 if (flags & SCF_DO_SUBSTR) {
3799                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800                     data->longest = &(data->longest_float);
3801                 }
3802                 goto optimize_curly_tail;
3803             case CURLY:
3804                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805                     && (scan->flags == stopparen))
3806                 {
3807                     mincount = 1;
3808                     maxcount = 1;
3809                 } else {
3810                     mincount = ARG1(scan);
3811                     maxcount = ARG2(scan);
3812                 }
3813                 next = regnext(scan);
3814                 if (OP(scan) == CURLYX) {
3815                     I32 lp = (data ? *(data->last_closep) : 0);
3816                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3817                 }
3818                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819                 next_is_eval = (OP(scan) == EVAL);
3820               do_curly:
3821                 if (flags & SCF_DO_SUBSTR) {
3822                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823                     pos_before = data->pos_min;
3824                 }
3825                 if (data) {
3826                     fl = data->flags;
3827                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3828                     if (is_inf)
3829                         data->flags |= SF_IS_INF;
3830                 }
3831                 if (flags & SCF_DO_STCLASS) {
3832                     cl_init(pRExC_state, &this_class);
3833                     oclass = data->start_class;
3834                     data->start_class = &this_class;
3835                     f |= SCF_DO_STCLASS_AND;
3836                     f &= ~SCF_DO_STCLASS_OR;
3837                 }
3838                 /* Exclude from super-linear cache processing any {n,m}
3839                    regops for which the combination of input pos and regex
3840                    pos is not enough information to determine if a match
3841                    will be possible.
3842
3843                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844                    regex pos at the \s*, the prospects for a match depend not
3845                    only on the input position but also on how many (bar\s*)
3846                    repeats into the {4,8} we are. */
3847                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848                     f &= ~SCF_WHILEM_VISITED_POS;
3849
3850                 /* This will finish on WHILEM, setting scan, or on NULL: */
3851                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852                                       last, data, stopparen, recursed, NULL,
3853                                       (mincount == 0
3854                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3855
3856                 if (flags & SCF_DO_STCLASS)
3857                     data->start_class = oclass;
3858                 if (mincount == 0 || minnext == 0) {
3859                     if (flags & SCF_DO_STCLASS_OR) {
3860                         cl_or(pRExC_state, data->start_class, &this_class);
3861                     }
3862                     else if (flags & SCF_DO_STCLASS_AND) {
3863                         /* Switch to OR mode: cache the old value of
3864                          * data->start_class */
3865                         INIT_AND_WITHP;
3866                         StructCopy(data->start_class, and_withp,
3867                                    struct regnode_charclass_class);
3868                         flags &= ~SCF_DO_STCLASS_AND;
3869                         StructCopy(&this_class, data->start_class,
3870                                    struct regnode_charclass_class);
3871                         flags |= SCF_DO_STCLASS_OR;
3872                         SET_SSC_EOS(data->start_class);
3873                     }
3874                 } else {                /* Non-zero len */
3875                     if (flags & SCF_DO_STCLASS_OR) {
3876                         cl_or(pRExC_state, data->start_class, &this_class);
3877                         cl_and(data->start_class, and_withp);
3878                     }
3879                     else if (flags & SCF_DO_STCLASS_AND)
3880                         cl_and(data->start_class, &this_class);
3881                     flags &= ~SCF_DO_STCLASS;
3882                 }
3883                 if (!scan)              /* It was not CURLYX, but CURLY. */
3884                     scan = next;
3885                 if ( /* ? quantifier ok, except for (?{ ... }) */
3886                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3887                     && (minnext == 0) && (deltanext == 0)
3888                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3890                 {
3891                     /* Fatal warnings may leak the regexp without this: */
3892                     SAVEFREESV(RExC_rx_sv);
3893                     ckWARNreg(RExC_parse,
3894                               "Quantifier unexpected on zero-length expression");
3895                     (void)ReREFCNT_inc(RExC_rx_sv);
3896                 }
3897
3898                 min += minnext * mincount;
3899                 is_inf_internal |= deltanext == I32_MAX
3900                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
3901                 is_inf |= is_inf_internal;
3902                 if (is_inf)
3903                     delta = I32_MAX;
3904                 else
3905                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3906
3907                 /* Try powerful optimization CURLYX => CURLYN. */
3908                 if (  OP(oscan) == CURLYX && data
3909                       && data->flags & SF_IN_PAR
3910                       && !(data->flags & SF_HAS_EVAL)
3911                       && !deltanext && minnext == 1 ) {
3912                     /* Try to optimize to CURLYN.  */
3913                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914                     regnode * const nxt1 = nxt;
3915 #ifdef DEBUGGING
3916                     regnode *nxt2;
3917 #endif
3918
3919                     /* Skip open. */
3920                     nxt = regnext(nxt);
3921                     if (!REGNODE_SIMPLE(OP(nxt))
3922                         && !(PL_regkind[OP(nxt)] == EXACT
3923                              && STR_LEN(nxt) == 1))
3924                         goto nogo;
3925 #ifdef DEBUGGING
3926                     nxt2 = nxt;
3927 #endif
3928                     nxt = regnext(nxt);
3929                     if (OP(nxt) != CLOSE)
3930                         goto nogo;
3931                     if (RExC_open_parens) {
3932                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3934                     }
3935                     /* Now we know that nxt2 is the only contents: */
3936                     oscan->flags = (U8)ARG(nxt);
3937                     OP(oscan) = CURLYN;
3938                     OP(nxt1) = NOTHING; /* was OPEN. */
3939
3940 #ifdef DEBUGGING
3941                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3942                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3944                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3945                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3946                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3947 #endif
3948                 }
3949               nogo:
3950
3951                 /* Try optimization CURLYX => CURLYM. */
3952                 if (  OP(oscan) == CURLYX && data
3953                       && !(data->flags & SF_HAS_PAR)
3954                       && !(data->flags & SF_HAS_EVAL)
3955                       && !deltanext     /* atom is fixed width */
3956                       && minnext != 0   /* CURLYM can't handle zero width */
3957                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3958                 ) {
3959                     /* XXXX How to optimize if data == 0? */
3960                     /* Optimize to a simpler form.  */
3961                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3962                     regnode *nxt2;
3963
3964                     OP(oscan) = CURLYM;
3965                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3966                             && (OP(nxt2) != WHILEM))
3967                         nxt = nxt2;
3968                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3969                     /* Need to optimize away parenths. */
3970                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3971                         /* Set the parenth number.  */
3972                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3973
3974                         oscan->flags = (U8)ARG(nxt);
3975                         if (RExC_open_parens) {
3976                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3977                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3978                         }
3979                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3980                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3981
3982 #ifdef DEBUGGING
3983                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3984                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3985                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3986                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3987 #endif
3988 #if 0
3989                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3990                             regnode *nnxt = regnext(nxt1);
3991                             if (nnxt == nxt) {
3992                                 if (reg_off_by_arg[OP(nxt1)])
3993                                     ARG_SET(nxt1, nxt2 - nxt1);
3994                                 else if (nxt2 - nxt1 < U16_MAX)
3995                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3996                                 else
3997                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3998                             }
3999                             nxt1 = nnxt;
4000                         }
4001 #endif
4002                         /* Optimize again: */
4003                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4004                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4005                     }
4006                     else
4007                         oscan->flags = 0;
4008                 }
4009                 else if ((OP(oscan) == CURLYX)
4010                          && (flags & SCF_WHILEM_VISITED_POS)
4011                          /* See the comment on a similar expression above.
4012                             However, this time it's not a subexpression
4013                             we care about, but the expression itself. */
4014                          && (maxcount == REG_INFTY)
4015                          && data && ++data->whilem_c < 16) {
4016                     /* This stays as CURLYX, we can put the count/of pair. */
4017                     /* Find WHILEM (as in regexec.c) */
4018                     regnode *nxt = oscan + NEXT_OFF(oscan);
4019
4020                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4021                         nxt += ARG(nxt);
4022                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4023                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4024                 }
4025                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4026                     pars++;
4027                 if (flags & SCF_DO_SUBSTR) {
4028                     SV *last_str = NULL;
4029                     int counted = mincount != 0;
4030
4031                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4032 #if defined(SPARC64_GCC_WORKAROUND)
4033                         I32 b = 0;
4034                         STRLEN l = 0;
4035                         const char *s = NULL;
4036                         I32 old = 0;
4037
4038                         if (pos_before >= data->last_start_min)
4039                             b = pos_before;
4040                         else
4041                             b = data->last_start_min;
4042
4043                         l = 0;
4044                         s = SvPV_const(data->last_found, l);
4045                         old = b - data->last_start_min;
4046
4047 #else
4048                         I32 b = pos_before >= data->last_start_min
4049                             ? pos_before : data->last_start_min;
4050                         STRLEN l;
4051                         const char * const s = SvPV_const(data->last_found, l);
4052                         I32 old = b - data->last_start_min;
4053 #endif
4054
4055                         if (UTF)
4056                             old = utf8_hop((U8*)s, old) - (U8*)s;
4057                         l -= old;
4058                         /* Get the added string: */
4059                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4060                         if (deltanext == 0 && pos_before == b) {
4061                             /* What was added is a constant string */
4062                             if (mincount > 1) {
4063                                 SvGROW(last_str, (mincount * l) + 1);
4064                                 repeatcpy(SvPVX(last_str) + l,
4065                                           SvPVX_const(last_str), l, mincount - 1);
4066                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4067                                 /* Add additional parts. */
4068                                 SvCUR_set(data->last_found,
4069                                           SvCUR(data->last_found) - l);
4070                                 sv_catsv(data->last_found, last_str);
4071                                 {
4072                                     SV * sv = data->last_found;
4073                                     MAGIC *mg =
4074                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4075                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4076                                     if (mg && mg->mg_len >= 0)
4077                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4078                                 }
4079                                 data->last_end += l * (mincount - 1);
4080                             }
4081                         } else {
4082                             /* start offset must point into the last copy */
4083                             data->last_start_min += minnext * (mincount - 1);
4084                             data->last_start_max += is_inf ? I32_MAX
4085                                 : (maxcount - 1) * (minnext + data->pos_delta);
4086                         }
4087                     }
4088                     /* It is counted once already... */
4089                     data->pos_min += minnext * (mincount - counted);
4090 #if 0
4091 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4092     counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4093 if (deltanext != I32_MAX)
4094 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4095 #endif
4096                     if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4097                         data->pos_delta = I32_MAX;
4098                     else
4099                         data->pos_delta += - counted * deltanext +
4100                         (minnext + deltanext) * maxcount - minnext * mincount;
4101                     if (mincount != maxcount) {
4102                          /* Cannot extend fixed substrings found inside
4103                             the group.  */
4104                         SCAN_COMMIT(pRExC_state,data,minlenp);
4105                         if (mincount && last_str) {
4106                             SV * const sv = data->last_found;
4107                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4108                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4109
4110                             if (mg)
4111                                 mg->mg_len = -1;
4112                             sv_setsv(sv, last_str);
4113                             data->last_end = data->pos_min;
4114                             data->last_start_min =
4115                                 data->pos_min - CHR_SVLEN(last_str);
4116                             data->last_start_max = is_inf
4117                                 ? I32_MAX
4118                                 : data->pos_min + data->pos_delta
4119                                 - CHR_SVLEN(last_str);
4120                         }
4121                         data->longest = &(data->longest_float);
4122                     }
4123                     SvREFCNT_dec(last_str);
4124                 }
4125                 if (data && (fl & SF_HAS_EVAL))
4126                     data->flags |= SF_HAS_EVAL;
4127               optimize_curly_tail:
4128                 if (OP(oscan) != CURLYX) {
4129                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4130                            && NEXT_OFF(next))
4131                         NEXT_OFF(oscan) += NEXT_OFF(next);
4132                 }
4133                 continue;
4134             default:                    /* REF, and CLUMP only? */
4135                 if (flags & SCF_DO_SUBSTR) {
4136                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4137                     data->longest = &(data->longest_float);
4138                 }
4139                 is_inf = is_inf_internal = 1;
4140                 if (flags & SCF_DO_STCLASS_OR)
4141                     cl_anything(pRExC_state, data->start_class);
4142                 flags &= ~SCF_DO_STCLASS;
4143                 break;
4144             }
4145         }
4146         else if (OP(scan) == LNBREAK) {
4147             if (flags & SCF_DO_STCLASS) {
4148                 int value = 0;
4149                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4150                 if (flags & SCF_DO_STCLASS_AND) {
4151                     for (value = 0; value < 256; value++)
4152                         if (!is_VERTWS_cp(value))
4153                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4154                 }
4155                 else {
4156                     for (value = 0; value < 256; value++)
4157                         if (is_VERTWS_cp(value))
4158                             ANYOF_BITMAP_SET(data->start_class, value);
4159                 }
4160                 if (flags & SCF_DO_STCLASS_OR)
4161                     cl_and(data->start_class, and_withp);
4162                 flags &= ~SCF_DO_STCLASS;
4163             }
4164             min++;
4165             delta++;    /* Because of the 2 char string cr-lf */
4166             if (flags & SCF_DO_SUBSTR) {
4167                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4168                 data->pos_min += 1;
4169                 data->pos_delta += 1;
4170                 data->longest = &(data->longest_float);
4171             }
4172         }
4173         else if (REGNODE_SIMPLE(OP(scan))) {
4174             int value = 0;
4175
4176             if (flags & SCF_DO_SUBSTR) {
4177                 SCAN_COMMIT(pRExC_state,data,minlenp);
4178                 data->pos_min++;
4179             }
4180             min++;
4181             if (flags & SCF_DO_STCLASS) {
4182                 int loop_max = 256;
4183                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4184
4185                 /* Some of the logic below assumes that switching
4186                    locale on will only add false positives. */
4187                 switch (PL_regkind[OP(scan)]) {
4188                     U8 classnum;
4189
4190                 case SANY:
4191                 default:
4192 #ifdef DEBUGGING
4193                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4194 #endif
4195                  do_default:
4196                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4197                         cl_anything(pRExC_state, data->start_class);
4198                     break;
4199                 case REG_ANY:
4200                     if (OP(scan) == SANY)
4201                         goto do_default;
4202                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4203                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4204                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4205                         cl_anything(pRExC_state, data->start_class);
4206                     }
4207                     if (flags & SCF_DO_STCLASS_AND || !value)
4208                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4209                     break;
4210                 case ANYOF:
4211                     if (flags & SCF_DO_STCLASS_AND)
4212                         cl_and(data->start_class,
4213                                (struct regnode_charclass_class*)scan);
4214                     else
4215                         cl_or(pRExC_state, data->start_class,
4216                               (struct regnode_charclass_class*)scan);
4217                     break;
4218                 case POSIXA:
4219                     loop_max = 128;
4220                     /* FALL THROUGH */
4221                 case POSIXL:
4222                 case POSIXD:
4223                 case POSIXU:
4224                     classnum = FLAGS(scan);
4225                     if (flags & SCF_DO_STCLASS_AND) {
4226                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4228                             for (value = 0; value < loop_max; value++) {
4229                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4230                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4231                                 }
4232                             }
4233                         }
4234                     }
4235                     else {
4236                         if (data->start_class->flags & ANYOF_LOCALE) {
4237                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4238                         }
4239                         else {
4240
4241                         /* Even if under locale, set the bits for non-locale
4242                          * in case it isn't a true locale-node.  This will
4243                          * create false positives if it truly is locale */
4244                         for (value = 0; value < loop_max; value++) {
4245                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4246                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4247                             }
4248                         }
4249                         }
4250                     }
4251                     break;
4252                 case NPOSIXA:
4253                     loop_max = 128;
4254                     /* FALL THROUGH */
4255                 case NPOSIXL:
4256                 case NPOSIXU:
4257                 case NPOSIXD:
4258                     classnum = FLAGS(scan);
4259                     if (flags & SCF_DO_STCLASS_AND) {
4260                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4261                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4262                             for (value = 0; value < loop_max; value++) {
4263                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4264                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4265                                 }
4266                             }
4267                         }
4268                     }
4269                     else {
4270                         if (data->start_class->flags & ANYOF_LOCALE) {
4271                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4272                         }
4273                         else {
4274
4275                         /* Even if under locale, set the bits for non-locale in
4276                          * case it isn't a true locale-node.  This will create
4277                          * false positives if it truly is locale */
4278                         for (value = 0; value < loop_max; value++) {
4279                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4280                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4281                             }
4282                         }
4283                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4284                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4285                         }
4286                         }
4287                     }
4288                     break;
4289                 }
4290                 if (flags & SCF_DO_STCLASS_OR)
4291                     cl_and(data->start_class, and_withp);
4292                 flags &= ~SCF_DO_STCLASS;
4293             }
4294         }
4295         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4296             data->flags |= (OP(scan) == MEOL
4297                             ? SF_BEFORE_MEOL
4298                             : SF_BEFORE_SEOL);
4299             SCAN_COMMIT(pRExC_state, data, minlenp);
4300
4301         }
4302         else if (  PL_regkind[OP(scan)] == BRANCHJ
4303                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4304                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4305                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4306             if ( OP(scan) == UNLESSM &&
4307                  scan->flags == 0 &&
4308                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4309                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4310             ) {
4311                 regnode *opt;
4312                 regnode *upto= regnext(scan);
4313                 DEBUG_PARSE_r({
4314                     SV * const mysv_val=sv_newmortal();
4315                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4316
4317                     /*DEBUG_PARSE_MSG("opfail");*/
4318                     regprop(RExC_rx, mysv_val, upto);
4319                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4320                                   SvPV_nolen_const(mysv_val),
4321                                   (IV)REG_NODE_NUM(upto),
4322                                   (IV)(upto - scan)
4323                     );
4324                 });
4325                 OP(scan) = OPFAIL;
4326                 NEXT_OFF(scan) = upto - scan;
4327                 for (opt= scan + 1; opt < upto ; opt++)
4328                     OP(opt) = OPTIMIZED;
4329                 scan= upto;
4330                 continue;
4331             }
4332             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4333                 || OP(scan) == UNLESSM )
4334             {
4335                 /* Negative Lookahead/lookbehind
4336                    In this case we can't do fixed string optimisation.
4337                 */
4338
4339                 I32 deltanext, minnext, fake = 0;
4340                 regnode *nscan;
4341                 struct regnode_charclass_class intrnl;
4342                 int f = 0;
4343
4344                 data_fake.flags = 0;
4345                 if (data) {
4346                     data_fake.whilem_c = data->whilem_c;
4347                     data_fake.last_closep = data->last_closep;
4348                 }
4349                 else
4350                     data_fake.last_closep = &fake;
4351                 data_fake.pos_delta = delta;
4352                 if ( flags & SCF_DO_STCLASS && !scan->flags
4353                      && OP(scan) == IFMATCH ) { /* Lookahead */
4354                     cl_init(pRExC_state, &intrnl);
4355                     data_fake.start_class = &intrnl;
4356                     f |= SCF_DO_STCLASS_AND;
4357                 }
4358                 if (flags & SCF_WHILEM_VISITED_POS)
4359                     f |= SCF_WHILEM_VISITED_POS;
4360                 next = regnext(scan);
4361                 nscan = NEXTOPER(NEXTOPER(scan));
4362                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4363                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4364                 if (scan->flags) {
4365                     if (deltanext) {
4366                         FAIL("Variable length lookbehind not implemented");
4367                     }
4368                     else if (minnext > (I32)U8_MAX) {
4369                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4370                     }
4371                     scan->flags = (U8)minnext;
4372                 }
4373                 if (data) {
4374                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4375                         pars++;
4376                     if (data_fake.flags & SF_HAS_EVAL)
4377                         data->flags |= SF_HAS_EVAL;
4378                     data->whilem_c = data_fake.whilem_c;
4379                 }
4380                 if (f & SCF_DO_STCLASS_AND) {
4381                     if (flags & SCF_DO_STCLASS_OR) {
4382                         /* OR before, AND after: ideally we would recurse with
4383                          * data_fake to get the AND applied by study of the
4384                          * remainder of the pattern, and then derecurse;
4385                          * *** HACK *** for now just treat as "no information".
4386                          * See [perl #56690].
4387                          */
4388                         cl_init(pRExC_state, data->start_class);
4389                     }  else {
4390                         /* AND before and after: combine and continue */
4391                         const int was = TEST_SSC_EOS(data->start_class);
4392
4393                         cl_and(data->start_class, &intrnl);
4394                         if (was)
4395                             SET_SSC_EOS(data->start_class);
4396                     }
4397                 }
4398             }
4399 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4400             else {
4401                 /* Positive Lookahead/lookbehind
4402                    In this case we can do fixed string optimisation,
4403                    but we must be careful about it. Note in the case of
4404                    lookbehind the positions will be offset by the minimum
4405                    length of the pattern, something we won't know about
4406                    until after the recurse.
4407                 */
4408                 I32 deltanext, fake = 0;
4409                 regnode *nscan;
4410                 struct regnode_charclass_class intrnl;
4411                 int f = 0;
4412                 /* We use SAVEFREEPV so that when the full compile
4413                     is finished perl will clean up the allocated
4414                     minlens when it's all done. This way we don't
4415                     have to worry about freeing them when we know
4416                     they wont be used, which would be a pain.
4417                  */
4418                 I32 *minnextp;
4419                 Newx( minnextp, 1, I32 );
4420                 SAVEFREEPV(minnextp);
4421
4422                 if (data) {
4423                     StructCopy(data, &data_fake, scan_data_t);
4424                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4425                         f |= SCF_DO_SUBSTR;
4426                         if (scan->flags)
4427                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4428                         data_fake.last_found=newSVsv(data->last_found);
4429                     }
4430                 }
4431                 else
4432                     data_fake.last_closep = &fake;
4433                 data_fake.flags = 0;
4434                 data_fake.pos_delta = delta;
4435                 if (is_inf)
4436                     data_fake.flags |= SF_IS_INF;
4437                 if ( flags & SCF_DO_STCLASS && !scan->flags
4438                      && OP(scan) == IFMATCH ) { /* Lookahead */
4439                     cl_init(pRExC_state, &intrnl);
4440                     data_fake.start_class = &intrnl;
4441                     f |= SCF_DO_STCLASS_AND;
4442                 }
4443                 if (flags & SCF_WHILEM_VISITED_POS)
4444                     f |= SCF_WHILEM_VISITED_POS;
4445                 next = regnext(scan);
4446                 nscan = NEXTOPER(NEXTOPER(scan));
4447
4448                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4449                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4450                 if (scan->flags) {
4451                     if (deltanext) {
4452                         FAIL("Variable length lookbehind not implemented");
4453                     }
4454                     else if (*minnextp > (I32)U8_MAX) {
4455                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4456                     }
4457                     scan->flags = (U8)*minnextp;
4458                 }
4459
4460                 *minnextp += min;
4461
4462                 if (f & SCF_DO_STCLASS_AND) {
4463                     const int was = TEST_SSC_EOS(data.start_class);
4464
4465                     cl_and(data->start_class, &intrnl);
4466                     if (was)
4467                         SET_SSC_EOS(data->start_class);
4468                 }
4469                 if (data) {
4470                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4471                         pars++;
4472                     if (data_fake.flags & SF_HAS_EVAL)
4473                         data->flags |= SF_HAS_EVAL;
4474                     data->whilem_c = data_fake.whilem_c;
4475                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4476                         if (RExC_rx->minlen<*minnextp)
4477                             RExC_rx->minlen=*minnextp;
4478                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4479                         SvREFCNT_dec_NN(data_fake.last_found);
4480
4481                         if ( data_fake.minlen_fixed != minlenp )
4482                         {
4483                             data->offset_fixed= data_fake.offset_fixed;
4484                             data->minlen_fixed= data_fake.minlen_fixed;
4485                             data->lookbehind_fixed+= scan->flags;
4486                         }
4487                         if ( data_fake.minlen_float != minlenp )
4488                         {
4489                             data->minlen_float= data_fake.minlen_float;
4490                             data->offset_float_min=data_fake.offset_float_min;
4491                             data->offset_float_max=data_fake.offset_float_max;
4492                             data->lookbehind_float+= scan->flags;
4493                         }
4494                     }
4495                 }
4496             }
4497 #endif
4498         }
4499         else if (OP(scan) == OPEN) {
4500             if (stopparen != (I32)ARG(scan))
4501                 pars++;
4502         }
4503         else if (OP(scan) == CLOSE) {
4504             if (stopparen == (I32)ARG(scan)) {
4505                 break;
4506             }
4507             if ((I32)ARG(scan) == is_par) {
4508                 next = regnext(scan);
4509
4510                 if ( next && (OP(next) != WHILEM) && next < last)
4511                     is_par = 0;         /* Disable optimization */
4512             }
4513             if (data)
4514                 *(data->last_closep) = ARG(scan);
4515         }
4516         else if (OP(scan) == EVAL) {
4517                 if (data)
4518                     data->flags |= SF_HAS_EVAL;
4519         }
4520         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4521             if (flags & SCF_DO_SUBSTR) {
4522                 SCAN_COMMIT(pRExC_state,data,minlenp);
4523                 flags &= ~SCF_DO_SUBSTR;
4524             }
4525             if (data && OP(scan)==ACCEPT) {
4526                 data->flags |= SCF_SEEN_ACCEPT;
4527                 if (stopmin > min)
4528                     stopmin = min;
4529             }
4530         }
4531         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4532         {
4533                 if (flags & SCF_DO_SUBSTR) {
4534                     SCAN_COMMIT(pRExC_state,data,minlenp);
4535                     data->longest = &(data->longest_float);
4536                 }
4537                 is_inf = is_inf_internal = 1;
4538                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4539                     cl_anything(pRExC_state, data->start_class);
4540                 flags &= ~SCF_DO_STCLASS;
4541         }
4542         else if (OP(scan) == GPOS) {
4543             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4544                 !(delta || is_inf || (data && data->pos_delta)))
4545             {
4546                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4547                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4548                 if (RExC_rx->gofs < (U32)min)
4549                     RExC_rx->gofs = min;
4550             } else {
4551                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4552                 RExC_rx->gofs = 0;
4553             }
4554         }
4555 #ifdef TRIE_STUDY_OPT
4556 #ifdef FULL_TRIE_STUDY
4557         else if (PL_regkind[OP(scan)] == TRIE) {
4558             /* NOTE - There is similar code to this block above for handling
4559                BRANCH nodes on the initial study.  If you change stuff here
4560                check there too. */
4561             regnode *trie_node= scan;
4562             regnode *tail= regnext(scan);
4563             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4564             I32 max1 = 0, min1 = I32_MAX;
4565             struct regnode_charclass_class accum;
4566
4567             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4568                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4569             if (flags & SCF_DO_STCLASS)
4570                 cl_init_zero(pRExC_state, &accum);
4571
4572             if (!trie->jump) {
4573                 min1= trie->minlen;
4574                 max1= trie->maxlen;
4575             } else {
4576                 const regnode *nextbranch= NULL;
4577                 U32 word;
4578
4579                 for ( word=1 ; word <= trie->wordcount ; word++)
4580                 {
4581                     I32 deltanext=0, minnext=0, f = 0, fake;
4582                     struct regnode_charclass_class this_class;
4583
4584                     data_fake.flags = 0;
4585                     if (data) {
4586                         data_fake.whilem_c = data->whilem_c;
4587                         data_fake.last_closep = data->last_closep;
4588                     }
4589                     else
4590                         data_fake.last_closep = &fake;
4591                     data_fake.pos_delta = delta;
4592                     if (flags & SCF_DO_STCLASS) {
4593                         cl_init(pRExC_state, &this_class);
4594                         data_fake.start_class = &this_class;
4595                         f = SCF_DO_STCLASS_AND;
4596                     }
4597                     if (flags & SCF_WHILEM_VISITED_POS)
4598                         f |= SCF_WHILEM_VISITED_POS;
4599
4600                     if (trie->jump[word]) {
4601                         if (!nextbranch)
4602                             nextbranch = trie_node + trie->jump[0];
4603                         scan= trie_node + trie->jump[word];
4604                         /* We go from the jump point to the branch that follows
4605                            it. Note this means we need the vestigal unused branches
4606                            even though they arent otherwise used.
4607                          */
4608                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4609                             &deltanext, (regnode *)nextbranch, &data_fake,
4610                             stopparen, recursed, NULL, f,depth+1);
4611                     }
4612                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4613                         nextbranch= regnext((regnode*)nextbranch);
4614
4615                     if (min1 > (I32)(minnext + trie->minlen))
4616                         min1 = minnext + trie->minlen;
4617                     if (deltanext == I32_MAX) {
4618                         is_inf = is_inf_internal = 1;
4619                         max1 = I32_MAX;
4620                     } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4621                         max1 = minnext + deltanext + trie->maxlen;
4622
4623                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4624                         pars++;
4625                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4626                         if ( stopmin > min + min1)
4627                             stopmin = min + min1;
4628                         flags &= ~SCF_DO_SUBSTR;
4629                         if (data)
4630                             data->flags |= SCF_SEEN_ACCEPT;
4631                     }
4632                     if (data) {
4633                         if (data_fake.flags & SF_HAS_EVAL)
4634                             data->flags |= SF_HAS_EVAL;
4635                         data->whilem_c = data_fake.whilem_c;
4636                     }
4637                     if (flags & SCF_DO_STCLASS)
4638                         cl_or(pRExC_state, &accum, &this_class);
4639                 }
4640             }
4641             if (flags & SCF_DO_SUBSTR) {
4642                 data->pos_min += min1;
4643                 data->pos_delta += max1 - min1;
4644                 if (max1 != min1 || is_inf)
4645                     data->longest = &(data->longest_float);
4646             }
4647             min += min1;
4648             delta += max1 - min1;
4649             if (flags & SCF_DO_STCLASS_OR) {
4650                 cl_or(pRExC_state, data->start_class, &accum);
4651                 if (min1) {
4652                     cl_and(data->start_class, and_withp);
4653                     flags &= ~SCF_DO_STCLASS;
4654                 }
4655             }
4656             else if (flags & SCF_DO_STCLASS_AND) {
4657                 if (min1) {
4658                     cl_and(data->start_class, &accum);
4659                     flags &= ~SCF_DO_STCLASS;
4660                 }
4661                 else {
4662                     /* Switch to OR mode: cache the old value of
4663                      * data->start_class */
4664                     INIT_AND_WITHP;
4665                     StructCopy(data->start_class, and_withp,
4666                                struct regnode_charclass_class);
4667                     flags &= ~SCF_DO_STCLASS_AND;
4668                     StructCopy(&accum, data->start_class,
4669                                struct regnode_charclass_class);
4670                     flags |= SCF_DO_STCLASS_OR;
4671                     SET_SSC_EOS(data->start_class);
4672                 }
4673             }
4674             scan= tail;
4675             continue;
4676         }
4677 #else
4678         else if (PL_regkind[OP(scan)] == TRIE) {
4679             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4680             U8*bang=NULL;
4681
4682             min += trie->minlen;
4683             delta += (trie->maxlen - trie->minlen);
4684             flags &= ~SCF_DO_STCLASS; /* xxx */
4685             if (flags & SCF_DO_SUBSTR) {
4686                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4687                 data->pos_min += trie->minlen;
4688                 data->pos_delta += (trie->maxlen - trie->minlen);
4689                 if (trie->maxlen != trie->minlen)
4690                     data->longest = &(data->longest_float);
4691             }
4692             if (trie->jump) /* no more substrings -- for now /grr*/
4693                 flags &= ~SCF_DO_SUBSTR;
4694         }
4695 #endif /* old or new */
4696 #endif /* TRIE_STUDY_OPT */
4697
4698         /* Else: zero-length, ignore. */
4699         scan = regnext(scan);
4700     }
4701     if (frame) {
4702         last = frame->last;
4703         scan = frame->next;
4704         stopparen = frame->stop;
4705         frame = frame->prev;
4706         goto fake_study_recurse;
4707     }
4708
4709   finish:
4710     assert(!frame);
4711     DEBUG_STUDYDATA("pre-fin:",data,depth);
4712
4713     *scanp = scan;
4714     *deltap = is_inf_internal ? I32_MAX : delta;
4715     if (flags & SCF_DO_SUBSTR && is_inf)
4716         data->pos_delta = I32_MAX - data->pos_min;
4717     if (is_par > (I32)U8_MAX)
4718         is_par = 0;
4719     if (is_par && pars==1 && data) {
4720         data->flags |= SF_IN_PAR;
4721         data->flags &= ~SF_HAS_PAR;
4722     }
4723     else if (pars && data) {
4724         data->flags |= SF_HAS_PAR;
4725         data->flags &= ~SF_IN_PAR;
4726     }
4727     if (flags & SCF_DO_STCLASS_OR)
4728         cl_and(data->start_class, and_withp);
4729     if (flags & SCF_TRIE_RESTUDY)
4730         data->flags |=  SCF_TRIE_RESTUDY;
4731
4732     DEBUG_STUDYDATA("post-fin:",data,depth);
4733
4734     return min < stopmin ? min : stopmin;
4735 }
4736
4737 STATIC U32
4738 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4739 {
4740     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4741
4742     PERL_ARGS_ASSERT_ADD_DATA;
4743
4744     Renewc(RExC_rxi->data,
4745            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4746            char, struct reg_data);
4747     if(count)
4748         Renew(RExC_rxi->data->what, count + n, U8);
4749     else
4750         Newx(RExC_rxi->data->what, n, U8);
4751     RExC_rxi->data->count = count + n;
4752     Copy(s, RExC_rxi->data->what + count, n, U8);
4753     return count;
4754 }
4755
4756 /*XXX: todo make this not included in a non debugging perl */
4757 #ifndef PERL_IN_XSUB_RE
4758 void
4759 Perl_reginitcolors(pTHX)
4760 {
4761     dVAR;
4762     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4763     if (s) {
4764         char *t = savepv(s);
4765         int i = 0;
4766         PL_colors[0] = t;
4767         while (++i < 6) {
4768             t = strchr(t, '\t');
4769             if (t) {
4770                 *t = '\0';
4771                 PL_colors[i] = ++t;
4772             }
4773             else
4774                 PL_colors[i] = t = (char *)"";
4775         }
4776     } else {
4777         int i = 0;
4778         while (i < 6)
4779             PL_colors[i++] = (char *)"";
4780     }
4781     PL_colorset = 1;
4782 }
4783 #endif
4784
4785
4786 #ifdef TRIE_STUDY_OPT
4787 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4788     STMT_START {                                            \
4789         if (                                                \
4790               (data.flags & SCF_TRIE_RESTUDY)               \
4791               && ! restudied++                              \
4792         ) {                                                 \
4793             dOsomething;                                    \
4794             goto reStudy;                                   \
4795         }                                                   \
4796     } STMT_END
4797 #else
4798 #define CHECK_RESTUDY_GOTO_butfirst
4799 #endif
4800
4801 /*
4802  * pregcomp - compile a regular expression into internal code
4803  *
4804  * Decides which engine's compiler to call based on the hint currently in
4805  * scope
4806  */
4807
4808 #ifndef PERL_IN_XSUB_RE
4809
4810 /* return the currently in-scope regex engine (or the default if none)  */
4811
4812 regexp_engine const *
4813 Perl_current_re_engine(pTHX)
4814 {
4815     dVAR;
4816
4817     if (IN_PERL_COMPILETIME) {
4818         HV * const table = GvHV(PL_hintgv);
4819         SV **ptr;
4820
4821         if (!table)
4822             return &reh_regexp_engine;
4823         ptr = hv_fetchs(table, "regcomp", FALSE);
4824         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4825             return &reh_regexp_engine;
4826         return INT2PTR(regexp_engine*,SvIV(*ptr));
4827     }
4828     else {
4829         SV *ptr;
4830         if (!PL_curcop->cop_hints_hash)
4831             return &reh_regexp_engine;
4832         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4833         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4834             return &reh_regexp_engine;
4835         return INT2PTR(regexp_engine*,SvIV(ptr));
4836     }
4837 }
4838
4839
4840 REGEXP *
4841 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4842 {
4843     dVAR;
4844     regexp_engine const *eng = current_re_engine();
4845     GET_RE_DEBUG_FLAGS_DECL;
4846
4847     PERL_ARGS_ASSERT_PREGCOMP;
4848
4849     /* Dispatch a request to compile a regexp to correct regexp engine. */
4850     DEBUG_COMPILE_r({
4851         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4852                         PTR2UV(eng));
4853     });
4854     return CALLREGCOMP_ENG(eng, pattern, flags);
4855 }
4856 #endif
4857
4858 /* public(ish) entry point for the perl core's own regex compiling code.
4859  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4860  * pattern rather than a list of OPs, and uses the internal engine rather
4861  * than the current one */
4862
4863 REGEXP *
4864 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4865 {
4866     SV *pat = pattern; /* defeat constness! */
4867     PERL_ARGS_ASSERT_RE_COMPILE;
4868     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4869 #ifdef PERL_IN_XSUB_RE
4870                                 &my_reg_engine,
4871 #else
4872                                 &reh_regexp_engine,
4873 #endif
4874                                 NULL, NULL, rx_flags, 0);
4875 }
4876
4877 /* see if there are any run-time code blocks in the pattern.
4878  * False positives are allowed */
4879
4880 static bool
4881 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4882                     char *pat, STRLEN plen)
4883 {
4884     int n = 0;
4885     STRLEN s;
4886
4887     for (s = 0; s < plen; s++) {
4888         if (n < pRExC_state->num_code_blocks
4889             && s == pRExC_state->code_blocks[n].start)
4890         {
4891             s = pRExC_state->code_blocks[n].end;
4892             n++;
4893             continue;
4894         }
4895         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4896          * positives here */
4897         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4898             (pat[s+2] == '{'
4899                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4900         )
4901             return 1;
4902     }
4903     return 0;
4904 }
4905
4906 /* Handle run-time code blocks. We will already have compiled any direct
4907  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4908  * copy of it, but with any literal code blocks blanked out and
4909  * appropriate chars escaped; then feed it into
4910  *
4911  *    eval "qr'modified_pattern'"
4912  *
4913  * For example,
4914  *
4915  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4916  *
4917  * becomes
4918  *
4919  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4920  *
4921  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4922  * and merge them with any code blocks of the original regexp.
4923  *
4924  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4925  * instead, just save the qr and return FALSE; this tells our caller that
4926  * the original pattern needs upgrading to utf8.
4927  */
4928
4929 static bool
4930 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4931     char *pat, STRLEN plen)
4932 {
4933     SV *qr;
4934
4935     GET_RE_DEBUG_FLAGS_DECL;
4936
4937     if (pRExC_state->runtime_code_qr) {
4938         /* this is the second time we've been called; this should
4939          * only happen if the main pattern got upgraded to utf8
4940          * during compilation; re-use the qr we compiled first time
4941          * round (which should be utf8 too)
4942          */
4943         qr = pRExC_state->runtime_code_qr;
4944         pRExC_state->runtime_code_qr = NULL;
4945         assert(RExC_utf8 && SvUTF8(qr));
4946     }
4947     else {
4948         int n = 0;
4949         STRLEN s;
4950         char *p, *newpat;
4951         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4952         SV *sv, *qr_ref;
4953         dSP;
4954
4955         /* determine how many extra chars we need for ' and \ escaping */
4956         for (s = 0; s < plen; s++) {
4957             if (pat[s] == '\'' || pat[s] == '\\')
4958                 newlen++;
4959         }
4960
4961         Newx(newpat, newlen, char);
4962         p = newpat;
4963         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4964
4965         for (s = 0; s < plen; s++) {
4966             if (n < pRExC_state->num_code_blocks
4967                 && s == pRExC_state->code_blocks[n].start)
4968             {
4969                 /* blank out literal code block */
4970                 assert(pat[s] == '(');
4971                 while (s <= pRExC_state->code_blocks[n].end) {
4972                     *p++ = '_';
4973                     s++;
4974                 }
4975                 s--;
4976                 n++;
4977                 continue;
4978             }
4979             if (pat[s] == '\'' || pat[s] == '\\')
4980                 *p++ = '\\';
4981             *p++ = pat[s];
4982         }
4983         *p++ = '\'';
4984         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4985             *p++ = 'x';
4986         *p++ = '\0';
4987         DEBUG_COMPILE_r({
4988             PerlIO_printf(Perl_debug_log,
4989                 "%sre-parsing pattern for runtime code:%s %s\n",
4990                 PL_colors[4],PL_colors[5],newpat);
4991         });
4992
4993         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4994         Safefree(newpat);
4995
4996         ENTER;
4997         SAVETMPS;
4998         save_re_context();
4999         PUSHSTACKi(PERLSI_REQUIRE);
5000         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5001          * parsing qr''; normally only q'' does this. It also alters
5002          * hints handling */
5003         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5004         SvREFCNT_dec_NN(sv);
5005         SPAGAIN;
5006         qr_ref = POPs;
5007         PUTBACK;
5008         {
5009             SV * const errsv = ERRSV;
5010             if (SvTRUE_NN(errsv))
5011             {
5012                 Safefree(pRExC_state->code_blocks);
5013                 /* use croak_sv ? */
5014                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5015             }
5016         }
5017         assert(SvROK(qr_ref));
5018         qr = SvRV(qr_ref);
5019         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5020         /* the leaving below frees the tmp qr_ref.
5021          * Give qr a life of its own */
5022         SvREFCNT_inc(qr);
5023         POPSTACK;
5024         FREETMPS;
5025         LEAVE;
5026
5027     }
5028
5029     if (!RExC_utf8 && SvUTF8(qr)) {
5030         /* first time through; the pattern got upgraded; save the
5031          * qr for the next time through */
5032         assert(!pRExC_state->runtime_code_qr);
5033         pRExC_state->runtime_code_qr = qr;
5034         return 0;
5035     }
5036
5037
5038     /* extract any code blocks within the returned qr//  */
5039
5040
5041     /* merge the main (r1) and run-time (r2) code blocks into one */
5042     {
5043         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5044         struct reg_code_block *new_block, *dst;
5045         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5046         int i1 = 0, i2 = 0;
5047
5048         if (!r2->num_code_blocks) /* we guessed wrong */
5049         {
5050             SvREFCNT_dec_NN(qr);
5051             return 1;
5052         }
5053
5054         Newx(new_block,
5055             r1->num_code_blocks + r2->num_code_blocks,
5056             struct reg_code_block);
5057         dst = new_block;
5058
5059         while (    i1 < r1->num_code_blocks
5060                 || i2 < r2->num_code_blocks)
5061         {
5062             struct reg_code_block *src;
5063             bool is_qr = 0;
5064
5065             if (i1 == r1->num_code_blocks) {
5066                 src = &r2->code_blocks[i2++];
5067                 is_qr = 1;
5068             }
5069             else if (i2 == r2->num_code_blocks)
5070                 src = &r1->code_blocks[i1++];
5071             else if (  r1->code_blocks[i1].start
5072                      < r2->code_blocks[i2].start)
5073             {
5074                 src = &r1->code_blocks[i1++];
5075                 assert(src->end < r2->code_blocks[i2].start);
5076             }
5077             else {
5078                 assert(  r1->code_blocks[i1].start
5079                        > r2->code_blocks[i2].start);
5080                 src = &r2->code_blocks[i2++];
5081                 is_qr = 1;
5082                 assert(src->end < r1->code_blocks[i1].start);
5083             }
5084
5085             assert(pat[src->start] == '(');
5086             assert(pat[src->end]   == ')');
5087             dst->start      = src->start;
5088             dst->end        = src->end;
5089             dst->block      = src->block;
5090             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5091                                     : src->src_regex;
5092             dst++;
5093         }
5094         r1->num_code_blocks += r2->num_code_blocks;
5095         Safefree(r1->code_blocks);
5096         r1->code_blocks = new_block;
5097     }
5098
5099     SvREFCNT_dec_NN(qr);
5100     return 1;
5101 }
5102
5103
5104 STATIC bool
5105 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5106 {
5107     /* This is the common code for setting up the floating and fixed length
5108      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5109      * as to whether succeeded or not */
5110
5111     I32 t,ml;
5112
5113     if (! (longest_length
5114            || (eol /* Can't have SEOL and MULTI */
5115                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5116           )
5117             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5118         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5119     {
5120         return FALSE;
5121     }
5122
5123     /* copy the information about the longest from the reg_scan_data
5124         over to the program. */
5125     if (SvUTF8(sv_longest)) {
5126         *rx_utf8 = sv_longest;
5127         *rx_substr = NULL;
5128     } else {
5129         *rx_substr = sv_longest;
5130         *rx_utf8 = NULL;
5131     }
5132     /* end_shift is how many chars that must be matched that
5133         follow this item. We calculate it ahead of time as once the
5134         lookbehind offset is added in we lose the ability to correctly
5135         calculate it.*/
5136     ml = minlen ? *(minlen) : (I32)longest_length;
5137     *rx_end_shift = ml - offset
5138         - longest_length + (SvTAIL(sv_longest) != 0)
5139         + lookbehind;
5140
5141     t = (eol/* Can't have SEOL and MULTI */
5142          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5143     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5144
5145     return TRUE;
5146 }
5147
5148 /*
5149  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5150  * regular expression into internal code.
5151  * The pattern may be passed either as:
5152  *    a list of SVs (patternp plus pat_count)
5153  *    a list of OPs (expr)
5154  * If both are passed, the SV list is used, but the OP list indicates
5155  * which SVs are actually pre-compiled code blocks
5156  *
5157  * The SVs in the list have magic and qr overloading applied to them (and
5158  * the list may be modified in-place with replacement SVs in the latter
5159  * case).
5160  *
5161  * If the pattern hasn't changed from old_re, then old_re will be
5162  * returned.
5163  *
5164  * eng is the current engine. If that engine has an op_comp method, then
5165  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5166  * do the initial concatenation of arguments and pass on to the external
5167  * engine.
5168  *
5169  * If is_bare_re is not null, set it to a boolean indicating whether the
5170  * arg list reduced (after overloading) to a single bare regex which has
5171  * been returned (i.e. /$qr/).
5172  *
5173  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5174  *
5175  * pm_flags contains the PMf_* flags, typically based on those from the
5176  * pm_flags field of the related PMOP. Currently we're only interested in
5177  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5178  *
5179  * We can't allocate space until we know how big the compiled form will be,
5180  * but we can't compile it (and thus know how big it is) until we've got a
5181  * place to put the code.  So we cheat:  we compile it twice, once with code
5182  * generation turned off and size counting turned on, and once "for real".
5183  * This also means that we don't allocate space until we are sure that the
5184  * thing really will compile successfully, and we never have to move the
5185  * code and thus invalidate pointers into it.  (Note that it has to be in
5186  * one piece because free() must be able to free it all.) [NB: not true in perl]
5187  *
5188  * Beware that the optimization-preparation code in here knows about some
5189  * of the structure of the compiled regexp.  [I'll say.]
5190  */
5191
5192 REGEXP *
5193 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5194                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5195                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5196 {
5197     dVAR;
5198     REGEXP *rx;
5199     struct regexp *r;
5200     regexp_internal *ri;
5201     STRLEN plen;
5202     char *exp;
5203     char* xend;
5204     regnode *scan;
5205     I32 flags;
5206     I32 minlen = 0;
5207     U32 rx_flags;
5208     SV *pat = NULL;
5209     SV *code_blocksv = NULL;
5210     SV** new_patternp = patternp;
5211
5212     /* these are all flags - maybe they should be turned
5213      * into a single int with different bit masks */
5214     I32 sawlookahead = 0;
5215     I32 sawplus = 0;
5216     I32 sawopen = 0;
5217     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5218     bool recompile = 0;
5219     bool runtime_code = 0;
5220     scan_data_t data;
5221     RExC_state_t RExC_state;
5222     RExC_state_t * const pRExC_state = &RExC_state;
5223 #ifdef TRIE_STUDY_OPT
5224     int restudied = 0;
5225     RExC_state_t copyRExC_state;
5226 #endif
5227     GET_RE_DEBUG_FLAGS_DECL;
5228
5229     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5230
5231     DEBUG_r(if (!PL_colorset) reginitcolors());
5232
5233 #ifndef PERL_IN_XSUB_RE
5234     /* Initialize these here instead of as-needed, as is quick and avoids
5235      * having to test them each time otherwise */
5236     if (! PL_AboveLatin1) {
5237         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5238         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5239         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5240
5241         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5242                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5243         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5244                                 = _new_invlist_C_array(PosixAlnum_invlist);
5245
5246         PL_L1Posix_ptrs[_CC_ALPHA]
5247                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5248         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5249
5250         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5251         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5252
5253         /* Cased is the same as Alpha in the ASCII range */
5254         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5255         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5256
5257         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5258         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5259
5260         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5261         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5262
5263         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5264         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5265
5266         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5267         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5268
5269         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5270         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5271
5272         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5273         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5274
5275         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5276         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5277         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5278         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5279
5280         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5281         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5282
5283         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5284
5285         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5286         PL_L1Posix_ptrs[_CC_WORDCHAR]
5287                                 = _new_invlist_C_array(L1PosixWord_invlist);
5288
5289         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5290         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5291
5292         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5293     }
5294 #endif
5295
5296     pRExC_state->code_blocks = NULL;
5297     pRExC_state->num_code_blocks = 0;
5298
5299     if (is_bare_re)
5300         *is_bare_re = FALSE;
5301
5302     if (expr && (expr->op_type == OP_LIST ||
5303                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5304         /* allocate code_blocks if needed */
5305         OP *o;
5306         int ncode = 0;
5307
5308         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5309             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5310                 ncode++; /* count of DO blocks */
5311         if (ncode) {
5312             pRExC_state->num_code_blocks = ncode;
5313             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5314         }
5315     }
5316
5317     if (!pat_count) {
5318         /* compile-time pattern with just OP_CONSTs and DO blocks */
5319
5320         int n;
5321         OP *o;
5322
5323         /* find how many CONSTs there are */
5324         assert(expr);
5325         n = 0;
5326         if (expr->op_type == OP_CONST)
5327             n = 1;
5328         else
5329             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5330                 if (o->op_type == OP_CONST)
5331                     n++;
5332             }
5333
5334         /* fake up an SV array */
5335
5336         assert(!new_patternp);
5337         Newx(new_patternp, n, SV*);
5338         SAVEFREEPV(new_patternp);
5339         pat_count = n;
5340
5341         n = 0;
5342         if (expr->op_type == OP_CONST)
5343             new_patternp[n] = cSVOPx_sv(expr);
5344         else
5345             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5346                 if (o->op_type == OP_CONST)
5347                     new_patternp[n++] = cSVOPo_sv;
5348             }
5349
5350     }
5351
5352     {
5353         /* concat args, handling magic, overloading etc */
5354
5355         SV **svp;
5356         OP *o = NULL;
5357         int n = 0;
5358         STRLEN orig_patlen = 0;
5359
5360         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5361             "Assembling pattern from %d elements%s\n", pat_count,
5362                 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5363
5364         /* apply magic and RE overloading to each arg */
5365         for (svp = new_patternp; svp < new_patternp + pat_count; svp++) {
5366             SV *rx = *svp;
5367             SvGETMAGIC(rx);
5368             if (SvROK(rx) && SvAMAGIC(rx)) {
5369                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5370                 if (sv) {
5371                     if (SvROK(sv))
5372                         sv = SvRV(sv);
5373                     if (SvTYPE(sv) != SVt_REGEXP)
5374                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5375                     *svp = sv;
5376                 }
5377             }
5378         }
5379
5380         if (pRExC_state->num_code_blocks) {
5381             if (expr->op_type == OP_CONST)
5382                 o = expr;
5383             else {
5384                 o = cLISTOPx(expr)->op_first;
5385                 assert(   o->op_type == OP_PUSHMARK
5386                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5387                        || o->op_type == OP_PADRANGE);
5388                 o = o->op_sibling;
5389             }
5390         }
5391
5392         if (pat_count > 1) {
5393
5394             pat = newSVpvn("", 0);
5395             SAVEFREESV(pat);
5396
5397             /* determine if the pattern is going to be utf8 (needed
5398              * in advance to align code block indices correctly).
5399              * XXX This could fail to be detected for an arg with
5400              * overloading but not concat overloading; but the main effect
5401              * in this obscure case is to need a 'use re eval' for a
5402              * literal code block */
5403             for (svp = new_patternp; svp < new_patternp + pat_count; svp++) {
5404                 if (SvUTF8(*svp))
5405                     SvUTF8_on(pat);
5406             }
5407         }
5408
5409         /* process args, concat them if there are multiple ones,
5410          * and find any code block indexes */
5411
5412
5413         for (svp = new_patternp; svp < new_patternp + pat_count; svp++) {
5414             SV *sv, *msv = *svp;
5415             SV *rx  = NULL;
5416             bool code = 0;
5417             /* we make the assumption here that each op in the list of
5418              * op_siblings maps to one SV pushed onto the stack,
5419              * except for code blocks, with have both an OP_NULL and
5420              * and OP_CONST.
5421              * This allows us to match up the list of SVs against the
5422              * list of OPs to find the next code block.
5423              *
5424              * Note that       PUSHMARK PADSV PADSV ..
5425              * is optimised to
5426              *                 PADRANGE NULL  NULL  ..
5427              * so the alignment still works. */
5428             if (o) {
5429                 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5430                     assert(n < pRExC_state->num_code_blocks);
5431                     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5432                     pRExC_state->code_blocks[n].block = o;
5433                     pRExC_state->code_blocks[n].src_regex = NULL;
5434                     n++;
5435                     code = 1;
5436                     o = o->op_sibling; /* skip CONST */
5437                     assert(o);
5438                 }
5439                 o = o->op_sibling;;
5440             }
5441
5442             /* try concatenation overload ... */
5443             if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5444                     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5445             {
5446                 sv_setsv(pat, sv);
5447                 /* overloading involved: all bets are off over literal
5448                  * code. Pretend we haven't seen it */
5449                 pRExC_state->num_code_blocks -= n;
5450                 n = 0;
5451             }
5452             else  {
5453                 /* ... or failing that, try "" overload */
5454                 while (SvAMAGIC(msv)
5455                         && (sv = AMG_CALLunary(msv, string_amg))
5456                         && sv != msv
5457                         &&  !(   SvROK(msv)
5458                               && SvROK(sv)
5459                               && SvRV(msv) == SvRV(sv))
5460                 ) {
5461                     msv = sv;
5462                     SvGETMAGIC(msv);
5463                 }
5464                 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5465                     msv = SvRV(msv);
5466                 if (pat) {
5467                     orig_patlen = SvCUR(pat);
5468                     sv_catsv_nomg(pat, msv);
5469                     rx = msv;
5470                 }
5471                 else
5472                     pat = msv;
5473                 if (code)
5474                     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5475             }
5476
5477             /* extract any code blocks within any embedded qr//'s */
5478             if (rx && SvTYPE(rx) == SVt_REGEXP
5479                 && RX_ENGINE((REGEXP*)rx)->op_comp)
5480             {
5481
5482                 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5483                 if (ri->num_code_blocks) {
5484                     int i;
5485                     /* the presence of an embedded qr// with code means
5486                      * we should always recompile: the text of the
5487                      * qr// may not have changed, but it may be a
5488                      * different closure than last time */
5489                     recompile = 1;
5490                     Renew(pRExC_state->code_blocks,
5491                         pRExC_state->num_code_blocks + ri->num_code_blocks,
5492                         struct reg_code_block);
5493                     pRExC_state->num_code_blocks += ri->num_code_blocks;
5494                     for (i=0; i < ri->num_code_blocks; i++) {
5495                         struct reg_code_block *src, *dst;
5496                         STRLEN offset =  orig_patlen
5497                             + ReANY((REGEXP *)rx)->pre_prefix;
5498                         assert(n < pRExC_state->num_code_blocks);
5499                         src = &ri->code_blocks[i];
5500                         dst = &pRExC_state->code_blocks[n];
5501                         dst->start          = src->start + offset;
5502                         dst->end            = src->end   + offset;
5503                         dst->block          = src->block;
5504                         dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5505                                                 src->src_regex
5506                                                     ? src->src_regex
5507                                                     : (REGEXP*)rx);
5508                         n++;
5509                     }
5510                 }
5511             }
5512         }
5513         if (pat_count > 1)
5514             SvSETMAGIC(pat);
5515
5516         /* handle bare (possibly after overloading) regex: foo =~ $re */
5517         {
5518             SV *re = pat;
5519             if (SvROK(re))
5520                 re = SvRV(re);
5521             if (SvTYPE(re) == SVt_REGEXP) {
5522                 if (is_bare_re)
5523                     *is_bare_re = TRUE;
5524                 SvREFCNT_inc(re);
5525                 Safefree(pRExC_state->code_blocks);
5526                 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5527                     "Precompiled pattern%s\n",
5528                         orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5529
5530                 return (REGEXP*)re;
5531             }
5532         }
5533     }
5534
5535     exp = SvPV_nomg(pat, plen);
5536     xend = exp + plen;
5537
5538     if (!eng->op_comp) {
5539         if ((SvUTF8(pat) && IN_BYTES)
5540                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5541         {
5542             /* make a temporary copy; either to convert to bytes,
5543              * or to avoid repeating get-magic / overloaded stringify */
5544             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5545                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5546         }
5547         Safefree(pRExC_state->code_blocks);
5548         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5549     }
5550
5551     /* ignore the utf8ness if the pattern is 0 length */
5552     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5553     RExC_uni_semantics = 0;
5554     RExC_contains_locale = 0;
5555     pRExC_state->runtime_code_qr = NULL;
5556
5557     DEBUG_COMPILE_r({
5558             SV *dsv= sv_newmortal();
5559             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5560             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5561                           PL_colors[4],PL_colors[5],s);
5562         });
5563
5564     if (0) {
5565       redo_first_pass:
5566         {
5567         U8 *const src = (U8*)exp;
5568         U8 *dst;
5569         int n=0;
5570         STRLEN s = 0, d = 0;
5571         bool do_end = 0;
5572
5573         /* It's possible to write a regexp in ascii that represents Unicode
5574         codepoints outside of the byte range, such as via \x{100}. If we
5575         detect such a sequence we have to convert the entire pattern to utf8
5576         and then recompile, as our sizing calculation will have been based
5577         on 1 byte == 1 character, but we will need to use utf8 to encode
5578         at least some part of the pattern, and therefore must convert the whole
5579         thing.
5580         -- dmq */
5581         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5582             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5583
5584         /* upgrade pattern to UTF8, and if there are code blocks,
5585          * recalculate the indices.
5586          * This is essentially an unrolled Perl_bytes_to_utf8() */
5587
5588         Newx(dst, plen * 2 + 1, U8);
5589
5590         while (s < plen) {
5591             const UV uv = NATIVE_TO_ASCII(src[s]);
5592             if (UNI_IS_INVARIANT(uv))
5593                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5594             else {
5595                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5596                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5597             }
5598             if (n < pRExC_state->num_code_blocks) {
5599                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5600                     pRExC_state->code_blocks[n].start = d;
5601                     assert(dst[d] == '(');
5602                     do_end = 1;
5603                 }
5604                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5605                     pRExC_state->code_blocks[n].end = d;
5606                     assert(dst[d] == ')');
5607                     do_end = 0;
5608                     n++;
5609                 }
5610             }
5611             s++;
5612             d++;
5613         }
5614         dst[d] = '\0';
5615         plen = d;
5616         exp = (char*) dst;
5617         xend = exp + plen;
5618         SAVEFREEPV(exp);
5619         RExC_orig_utf8 = RExC_utf8 = 1;
5620         }
5621     }
5622
5623     if ((pm_flags & PMf_USE_RE_EVAL)
5624                 /* this second condition covers the non-regex literal case,
5625                  * i.e.  $foo =~ '(?{})'. */
5626                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5627     )
5628         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5629
5630     /* return old regex if pattern hasn't changed */
5631     /* XXX: note in the below we have to check the flags as well as the pattern.
5632      *
5633      * Things get a touch tricky as we have to compare the utf8 flag independently
5634      * from the compile flags.
5635      */
5636
5637     if (   old_re
5638         && !recompile
5639         && !!RX_UTF8(old_re) == !!RExC_utf8
5640         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5641         && RX_PRECOMP(old_re)
5642         && RX_PRELEN(old_re) == plen
5643         && memEQ(RX_PRECOMP(old_re), exp, plen)
5644         && !runtime_code /* with runtime code, always recompile */ )
5645     {
5646         Safefree(pRExC_state->code_blocks);
5647         return old_re;
5648     }
5649
5650     rx_flags = orig_rx_flags;
5651
5652     if (initial_charset == REGEX_LOCALE_CHARSET) {
5653         RExC_contains_locale = 1;
5654     }
5655     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5656
5657         /* Set to use unicode semantics if the pattern is in utf8 and has the
5658          * 'depends' charset specified, as it means unicode when utf8  */
5659         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5660     }
5661
5662     RExC_precomp = exp;
5663     RExC_flags = rx_flags;
5664     RExC_pm_flags = pm_flags;
5665
5666     if (runtime_code) {
5667         if (TAINTING_get && TAINT_get)
5668             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5669
5670         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5671             /* whoops, we have a non-utf8 pattern, whilst run-time code
5672              * got compiled as utf8. Try again with a utf8 pattern */
5673             goto redo_first_pass;
5674         }
5675     }
5676     assert(!pRExC_state->runtime_code_qr);
5677
5678     RExC_sawback = 0;
5679
5680     RExC_seen = 0;
5681     RExC_in_lookbehind = 0;
5682     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5683     RExC_extralen = 0;
5684     RExC_override_recoding = 0;
5685     RExC_in_multi_char_class = 0;
5686
5687     /* First pass: determine size, legality. */
5688     RExC_parse = exp;
5689     RExC_start = exp;
5690     RExC_end = xend;
5691     RExC_naughty = 0;
5692     RExC_npar = 1;
5693     RExC_nestroot = 0;
5694     RExC_size = 0L;
5695     RExC_emit = &PL_regdummy;
5696     RExC_whilem_seen = 0;
5697     RExC_open_parens = NULL;
5698     RExC_close_parens = NULL;
5699     RExC_opend = NULL;
5700     RExC_paren_names = NULL;
5701 #ifdef DEBUGGING
5702     RExC_paren_name_list = NULL;
5703 #endif
5704     RExC_recurse = NULL;
5705     RExC_recurse_count = 0;
5706     pRExC_state->code_index = 0;
5707
5708 #if 0 /* REGC() is (currently) a NOP at the first pass.
5709        * Clever compilers notice this and complain. --jhi */
5710     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5711 #endif
5712     DEBUG_PARSE_r(
5713         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5714         RExC_lastnum=0;
5715         RExC_lastparse=NULL;
5716     );
5717     /* reg may croak on us, not giving us a chance to free
5718        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5719        need it to survive as long as the regexp (qr/(?{})/).
5720        We must check that code_blocksv is not already set, because we may
5721        have jumped back to restart the sizing pass. */
5722     if (pRExC_state->code_blocks && !code_blocksv) {
5723         code_blocksv = newSV_type(SVt_PV);
5724         SAVEFREESV(code_blocksv);
5725         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5726         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5727     }
5728     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5729         if (flags & RESTART_UTF8) {
5730             goto redo_first_pass;
5731         }
5732         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5733     }
5734     if (code_blocksv)
5735         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5736
5737     DEBUG_PARSE_r({
5738         PerlIO_printf(Perl_debug_log,
5739             "Required size %"IVdf" nodes\n"
5740             "Starting second pass (creation)\n",
5741             (IV)RExC_size);
5742         RExC_lastnum=0;
5743         RExC_lastparse=NULL;
5744     });
5745
5746     /* The first pass could have found things that force Unicode semantics */
5747     if ((RExC_utf8 || RExC_uni_semantics)
5748          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5749     {
5750         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5751     }
5752
5753     /* Small enough for pointer-storage convention?
5754        If extralen==0, this means that we will not need long jumps. */
5755     if (RExC_size >= 0x10000L && RExC_extralen)
5756         RExC_size += RExC_extralen;
5757     else
5758         RExC_extralen = 0;
5759     if (RExC_whilem_seen > 15)
5760         RExC_whilem_seen = 15;
5761
5762     /* Allocate space and zero-initialize. Note, the two step process
5763        of zeroing when in debug mode, thus anything assigned has to
5764        happen after that */
5765     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5766     r = ReANY(rx);
5767     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5768          char, regexp_internal);
5769     if ( r == NULL || ri == NULL )
5770         FAIL("Regexp out of space");
5771 #ifdef DEBUGGING
5772     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5773     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5774 #else
5775     /* bulk initialize base fields with 0. */
5776     Zero(ri, sizeof(regexp_internal), char);
5777 #endif
5778
5779     /* non-zero initialization begins here */
5780     RXi_SET( r, ri );
5781     r->engine= eng;
5782     r->extflags = rx_flags;
5783     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5784
5785     if (pm_flags & PMf_IS_QR) {
5786         ri->code_blocks = pRExC_state->code_blocks;
5787         ri->num_code_blocks = pRExC_state->num_code_blocks;
5788     }
5789     else
5790     {
5791         int n;
5792         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5793             if (pRExC_state->code_blocks[n].src_regex)
5794                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5795         SAVEFREEPV(pRExC_state->code_blocks);
5796     }
5797
5798     {
5799         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5800         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5801
5802         /* The caret is output if there are any defaults: if not all the STD
5803          * flags are set, or if no character set specifier is needed */
5804         bool has_default =
5805                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5806                     || ! has_charset);
5807         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5808         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5809                             >> RXf_PMf_STD_PMMOD_SHIFT);
5810         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5811         char *p;
5812         /* Allocate for the worst case, which is all the std flags are turned
5813          * on.  If more precision is desired, we could do a population count of
5814          * the flags set.  This could be done with a small lookup table, or by
5815          * shifting, masking and adding, or even, when available, assembly
5816          * language for a machine-language population count.
5817          * We never output a minus, as all those are defaults, so are
5818          * covered by the caret */
5819         const STRLEN wraplen = plen + has_p + has_runon
5820             + has_default       /* If needs a caret */
5821
5822                 /* If needs a character set specifier */
5823             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5824             + (sizeof(STD_PAT_MODS) - 1)
5825             + (sizeof("(?:)") - 1);
5826
5827         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5828         r->xpv_len_u.xpvlenu_pv = p;
5829         if (RExC_utf8)
5830             SvFLAGS(rx) |= SVf_UTF8;
5831         *p++='('; *p++='?';
5832
5833         /* If a default, cover it using the caret */
5834         if (has_default) {
5835             *p++= DEFAULT_PAT_MOD;
5836         }
5837         if (has_charset) {
5838             STRLEN len;
5839             const char* const name = get_regex_charset_name(r->extflags, &len);
5840             Copy(name, p, len, char);
5841             p += len;
5842         }
5843         if (has_p)
5844             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5845         {
5846             char ch;
5847             while((ch = *fptr++)) {
5848                 if(reganch & 1)
5849                     *p++ = ch;
5850                 reganch >>= 1;
5851             }
5852         }
5853
5854         *p++ = ':';
5855         Copy(RExC_precomp, p, plen, char);
5856         assert ((RX_WRAPPED(rx) - p) < 16);
5857         r->pre_prefix = p - RX_WRAPPED(rx);
5858         p += plen;
5859         if (has_runon)
5860             *p++ = '\n';
5861         *p++ = ')';
5862         *p = 0;
5863         SvCUR_set(rx, p - RX_WRAPPED(rx));
5864     }
5865
5866     r->intflags = 0;
5867     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5868
5869     if (RExC_seen & REG_SEEN_RECURSE) {
5870         Newxz(RExC_open_parens, RExC_npar,regnode *);
5871         SAVEFREEPV(RExC_open_parens);
5872         Newxz(RExC_close_parens,RExC_npar,regnode *);
5873         SAVEFREEPV(RExC_close_parens);
5874     }
5875
5876     /* Useful during FAIL. */
5877 #ifdef RE_TRACK_PATTERN_OFFSETS
5878     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5879     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5880                           "%s %"UVuf" bytes for offset annotations.\n",
5881                           ri->u.offsets ? "Got" : "Couldn't get",
5882                           (UV)((2*RExC_size+1) * sizeof(U32))));
5883 #endif
5884     SetProgLen(ri,RExC_size);
5885     RExC_rx_sv = rx;
5886     RExC_rx = r;
5887     RExC_rxi = ri;
5888     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5889
5890     /* Second pass: emit code. */
5891     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5892     RExC_pm_flags = pm_flags;
5893     RExC_parse = exp;
5894     RExC_end = xend;
5895     RExC_naughty = 0;
5896     RExC_npar = 1;
5897     RExC_emit_start = ri->program;
5898     RExC_emit = ri->program;
5899     RExC_emit_bound = ri->program + RExC_size + 1;
5900     pRExC_state->code_index = 0;
5901
5902     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5903     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5904         ReREFCNT_dec(rx);
5905         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5906     }
5907     /* XXXX To minimize changes to RE engine we always allocate
5908        3-units-long substrs field. */
5909     Newx(r->substrs, 1, struct reg_substr_data);
5910     if (RExC_recurse_count) {
5911         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5912         SAVEFREEPV(RExC_recurse);
5913     }
5914
5915 reStudy:
5916     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5917     Zero(r->substrs, 1, struct reg_substr_data);
5918
5919 #ifdef TRIE_STUDY_OPT
5920     if (!restudied) {
5921         StructCopy(&zero_scan_data, &data, scan_data_t);
5922         copyRExC_state = RExC_state;
5923     } else {
5924         U32 seen=RExC_seen;
5925         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5926
5927         RExC_state = copyRExC_state;
5928         if (seen & REG_TOP_LEVEL_BRANCHES)
5929             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5930         else
5931             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5932         StructCopy(&zero_scan_data, &data, scan_data_t);
5933     }
5934 #else
5935     StructCopy(&zero_scan_data, &data, scan_data_t);
5936 #endif
5937
5938     /* Dig out information for optimizations. */
5939     r->extflags = RExC_flags; /* was pm_op */
5940     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5941
5942     if (UTF)
5943         SvUTF8_on(rx);  /* Unicode in it? */
5944     ri->regstclass = NULL;
5945     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5946         r->intflags |= PREGf_NAUGHTY;
5947     scan = ri->program + 1;             /* First BRANCH. */
5948
5949     /* testing for BRANCH here tells us whether there is "must appear"
5950        data in the pattern. If there is then we can use it for optimisations */
5951     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5952         I32 fake;
5953         STRLEN longest_float_length, longest_fixed_length;
5954         struct regnode_charclass_class ch_class; /* pointed to by data */
5955         int stclass_flag;
5956         I32 last_close = 0; /* pointed to by data */
5957         regnode *first= scan;
5958         regnode *first_next= regnext(first);
5959         /*
5960          * Skip introductions and multiplicators >= 1
5961          * so that we can extract the 'meat' of the pattern that must
5962          * match in the large if() sequence following.
5963          * NOTE that EXACT is NOT covered here, as it is normally
5964          * picked up by the optimiser separately.
5965          *
5966          * This is unfortunate as the optimiser isnt handling lookahead
5967          * properly currently.
5968          *
5969          */
5970         while ((OP(first) == OPEN && (sawopen = 1)) ||
5971                /* An OR of *one* alternative - should not happen now. */
5972             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5973             /* for now we can't handle lookbehind IFMATCH*/
5974             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5975             (OP(first) == PLUS) ||
5976             (OP(first) == MINMOD) ||
5977                /* An {n,m} with n>0 */
5978             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5979             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5980         {
5981                 /*
5982                  * the only op that could be a regnode is PLUS, all the rest
5983                  * will be regnode_1 or regnode_2.
5984                  *
5985                  */
5986                 if (OP(first) == PLUS)
5987                     sawplus = 1;
5988                 else
5989                     first += regarglen[OP(first)];
5990
5991                 first = NEXTOPER(first);
5992                 first_next= regnext(first);
5993         }
5994
5995         /* Starting-point info. */
5996       again:
5997         DEBUG_PEEP("first:",first,0);
5998         /* Ignore EXACT as we deal with it later. */
5999         if (PL_regkind[OP(first)] == EXACT) {
6000             if (OP(first) == EXACT)
6001                 NOOP;   /* Empty, get anchored substr later. */
6002             else
6003                 ri->regstclass = first;
6004         }
6005 #ifdef TRIE_STCLASS
6006         else if (PL_regkind[OP(first)] == TRIE &&
6007                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6008         {
6009             regnode *trie_op;
6010             /* this can happen only on restudy */
6011             if ( OP(first) == TRIE ) {
6012                 struct regnode_1 *trieop = (struct regnode_1 *)
6013                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6014                 StructCopy(first,trieop,struct regnode_1);
6015                 trie_op=(regnode *)trieop;
6016             } else {
6017                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6018                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6019                 StructCopy(first,trieop,struct regnode_charclass);
6020                 trie_op=(regnode *)trieop;
6021             }
6022             OP(trie_op)+=2;
6023             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6024             ri->regstclass = trie_op;
6025         }
6026 #endif
6027         else if (REGNODE_SIMPLE(OP(first)))
6028             ri->regstclass = first;
6029         else if (PL_regkind[OP(first)] == BOUND ||
6030                  PL_regkind[OP(first)] == NBOUND)
6031             ri->regstclass = first;
6032         else if (PL_regkind[OP(first)] == BOL) {
6033             r->extflags |= (OP(first) == MBOL
6034                            ? RXf_ANCH_MBOL
6035                            : (OP(first) == SBOL
6036                               ? RXf_ANCH_SBOL
6037                               : RXf_ANCH_BOL));
6038             first = NEXTOPER(first);
6039             goto again;
6040         }
6041         else if (OP(first) == GPOS) {
6042             r->extflags |= RXf_ANCH_GPOS;
6043             first = NEXTOPER(first);
6044             goto again;
6045         }
6046         else if ((!sawopen || !RExC_sawback) &&
6047             (OP(first) == STAR &&
6048             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6049             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6050         {
6051             /* turn .* into ^.* with an implied $*=1 */
6052             const int type =
6053                 (OP(NEXTOPER(first)) == REG_ANY)
6054                     ? RXf_ANCH_MBOL
6055                     : RXf_ANCH_SBOL;
6056             r->extflags |= type;
6057             r->intflags |= PREGf_IMPLICIT;
6058             first = NEXTOPER(first);
6059             goto again;
6060         }
6061         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6062             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6063             /* x+ must match at the 1st pos of run of x's */
6064             r->intflags |= PREGf_SKIP;
6065
6066         /* Scan is after the zeroth branch, first is atomic matcher. */
6067 #ifdef TRIE_STUDY_OPT
6068         DEBUG_PARSE_r(
6069             if (!restudied)
6070                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6071                               (IV)(first - scan + 1))
6072         );
6073 #else
6074         DEBUG_PARSE_r(
6075             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6076                 (IV)(first - scan + 1))
6077         );
6078 #endif
6079
6080
6081         /*
6082         * If there's something expensive in the r.e., find the
6083         * longest literal string that must appear and make it the
6084         * regmust.  Resolve ties in favor of later strings, since
6085         * the regstart check works with the beginning of the r.e.
6086         * and avoiding duplication strengthens checking.  Not a
6087         * strong reason, but sufficient in the absence of others.
6088         * [Now we resolve ties in favor of the earlier string if
6089         * it happens that c_offset_min has been invalidated, since the
6090         * earlier string may buy us something the later one won't.]
6091         */
6092
6093         data.longest_fixed = newSVpvs("");
6094         data.longest_float = newSVpvs("");
6095         data.last_found = newSVpvs("");
6096         data.longest = &(data.longest_fixed);
6097         ENTER_with_name("study_chunk");
6098         SAVEFREESV(data.longest_fixed);
6099         SAVEFREESV(data.longest_float);
6100         SAVEFREESV(data.last_found);
6101         first = scan;
6102         if (!ri->regstclass) {
6103             cl_init(pRExC_state, &ch_class);
6104             data.start_class = &ch_class;
6105             stclass_flag = SCF_DO_STCLASS_AND;
6106         } else                          /* XXXX Check for BOUND? */
6107             stclass_flag = 0;
6108         data.last_closep = &last_close;
6109
6110         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6111             &data, -1, NULL, NULL,
6112             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6113
6114
6115         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6116
6117
6118         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6119              && data.last_start_min == 0 && data.last_end > 0
6120              && !RExC_seen_zerolen
6121              && !(RExC_seen & REG_SEEN_VERBARG)
6122              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6123             r->extflags |= RXf_CHECK_ALL;
6124         scan_commit(pRExC_state, &data,&minlen,0);
6125
6126         longest_float_length = CHR_SVLEN(data.longest_float);
6127
6128         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6129                    && data.offset_fixed == data.offset_float_min
6130                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6131             && S_setup_longest (aTHX_ pRExC_state,
6132                                     data.longest_float,
6133                                     &(r->float_utf8),
6134                                     &(r->float_substr),
6135                                     &(r->float_end_shift),
6136                                     data.lookbehind_float,
6137                                     data.offset_float_min,
6138                                     data.minlen_float,
6139                                     longest_float_length,
6140                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6141                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6142         {
6143             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6144             r->float_max_offset = data.offset_float_max;
6145             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6146                 r->float_max_offset -= data.lookbehind_float;
6147             SvREFCNT_inc_simple_void_NN(data.longest_float);
6148         }
6149         else {
6150             r->float_substr = r->float_utf8 = NULL;
6151             longest_float_length = 0;
6152         }
6153
6154         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6155
6156         if (S_setup_longest (aTHX_ pRExC_state,
6157                                 data.longest_fixed,
6158                                 &(r->anchored_utf8),
6159                                 &(r->anchored_substr),
6160                                 &(r->anchored_end_shift),
6161                                 data.lookbehind_fixed,
6162                                 data.offset_fixed,
6163                                 data.minlen_fixed,
6164                                 longest_fixed_length,
6165                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6166                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6167         {
6168             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6169             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6170         }
6171         else {
6172             r->anchored_substr = r->anchored_utf8 = NULL;
6173             longest_fixed_length = 0;
6174         }
6175         LEAVE_with_name("study_chunk");
6176
6177         if (ri->regstclass
6178             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6179             ri->regstclass = NULL;
6180
6181         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6182             && stclass_flag
6183             && ! TEST_SSC_EOS(data.start_class)
6184             && !cl_is_anything(data.start_class))
6185         {
6186             const U32 n = add_data(pRExC_state, 1, "f");
6187             OP(data.start_class) = ANYOF_SYNTHETIC;
6188
6189             Newx(RExC_rxi->data->data[n], 1,
6190                 struct regnode_charclass_class);
6191             StructCopy(data.start_class,
6192                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6193                        struct regnode_charclass_class);
6194             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6195             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6196             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6197                       regprop(r, sv, (regnode*)data.start_class);
6198                       PerlIO_printf(Perl_debug_log,
6199                                     "synthetic stclass \"%s\".\n",
6200                                     SvPVX_const(sv));});
6201         }
6202
6203         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6204         if (longest_fixed_length > longest_float_length) {
6205             r->check_end_shift = r->anchored_end_shift;
6206             r->check_substr = r->anchored_substr;
6207             r->check_utf8 = r->anchored_utf8;
6208             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6209             if (r->extflags & RXf_ANCH_SINGLE)
6210                 r->extflags |= RXf_NOSCAN;
6211         }
6212         else {
6213             r->check_end_shift = r->float_end_shift;
6214             r->check_substr = r->float_substr;
6215             r->check_utf8 = r->float_utf8;
6216             r->check_offset_min = r->float_min_offset;
6217             r->check_offset_max = r->float_max_offset;
6218         }
6219         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6220            This should be changed ASAP!  */
6221         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6222             r->extflags |= RXf_USE_INTUIT;
6223             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6224                 r->extflags |= RXf_INTUIT_TAIL;
6225         }
6226         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6227         if ( (STRLEN)minlen < longest_float_length )
6228             minlen= longest_float_length;
6229         if ( (STRLEN)minlen < longest_fixed_length )
6230             minlen= longest_fixed_length;
6231         */
6232     }
6233     else {
6234         /* Several toplevels. Best we can is to set minlen. */
6235         I32 fake;
6236         struct regnode_charclass_class ch_class;
6237         I32 last_close = 0;
6238
6239         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6240
6241         scan = ri->program + 1;
6242         cl_init(pRExC_state, &ch_class);
6243         data.start_class = &ch_class;
6244         data.last_closep = &last_close;
6245
6246
6247         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6248             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6249
6250         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6251
6252         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6253                 = r->float_substr = r->float_utf8 = NULL;
6254
6255         if (! TEST_SSC_EOS(data.start_class)
6256             && !cl_is_anything(data.start_class))
6257         {
6258             const U32 n = add_data(pRExC_state, 1, "f");
6259             OP(data.start_class) = ANYOF_SYNTHETIC;
6260
6261             Newx(RExC_rxi->data->data[n], 1,
6262                 struct regnode_charclass_class);
6263             StructCopy(data.start_class,
6264                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6265                        struct regnode_charclass_class);
6266             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6267             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6268             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6269                       regprop(r, sv, (regnode*)data.start_class);
6270                       PerlIO_printf(Perl_debug_log,
6271                                     "synthetic stclass \"%s\".\n",
6272                                     SvPVX_const(sv));});
6273         }
6274     }
6275
6276     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6277        the "real" pattern. */
6278     DEBUG_OPTIMISE_r({
6279         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6280                       (IV)minlen, (IV)r->minlen);
6281     });
6282     r->minlenret = minlen;
6283     if (r->minlen < minlen)
6284         r->minlen = minlen;
6285
6286     if (RExC_seen & REG_SEEN_GPOS)
6287         r->extflags |= RXf_GPOS_SEEN;
6288     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6289         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6290     if (pRExC_state->num_code_blocks)
6291         r->extflags |= RXf_EVAL_SEEN;
6292     if (RExC_seen & REG_SEEN_CANY)
6293         r->extflags |= RXf_CANY_SEEN;
6294     if (RExC_seen & REG_SEEN_VERBARG)
6295     {
6296         r->intflags |= PREGf_VERBARG_SEEN;
6297         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6298     }
6299     if (RExC_seen & REG_SEEN_CUTGROUP)
6300         r->intflags |= PREGf_CUTGROUP_SEEN;
6301     if (pm_flags & PMf_USE_RE_EVAL)
6302         r->intflags |= PREGf_USE_RE_EVAL;
6303     if (RExC_paren_names)
6304         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6305     else
6306         RXp_PAREN_NAMES(r) = NULL;
6307
6308     {
6309         regnode *first = ri->program + 1;
6310         U8 fop = OP(first);
6311         regnode *next = NEXTOPER(first);
6312         U8 nop = OP(next);
6313
6314         if (PL_regkind[fop] == NOTHING && nop == END)
6315             r->extflags |= RXf_NULL;
6316         else if (PL_regkind[fop] == BOL && nop == END)
6317             r->extflags |= RXf_START_ONLY;
6318         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6319             r->extflags |= RXf_WHITE;
6320         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6321             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6322
6323     }
6324 #ifdef DEBUGGING
6325     if (RExC_paren_names) {
6326         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6327         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6328     } else
6329 #endif
6330         ri->name_list_idx = 0;
6331
6332     if (RExC_recurse_count) {
6333         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6334             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6335             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6336         }
6337     }
6338     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6339     /* assume we don't need to swap parens around before we match */
6340
6341     DEBUG_DUMP_r({
6342         PerlIO_printf(Perl_debug_log,"Final program:\n");
6343         regdump(r);
6344     });
6345 #ifdef RE_TRACK_PATTERN_OFFSETS
6346     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6347         const U32 len = ri->u.offsets[0];
6348         U32 i;
6349         GET_RE_DEBUG_FLAGS_DECL;
6350         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6351         for (i = 1; i <= len; i++) {
6352             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6353                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6354                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6355             }
6356         PerlIO_printf(Perl_debug_log, "\n");
6357     });
6358 #endif
6359
6360 #ifdef USE_ITHREADS
6361     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6362      * by setting the regexp SV to readonly-only instead. If the
6363      * pattern's been recompiled, the USEDness should remain. */
6364     if (old_re && SvREADONLY(old_re))
6365         SvREADONLY_on(rx);
6366 #endif
6367     return rx;
6368 }
6369
6370
6371 SV*
6372 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6373                     const U32 flags)
6374 {
6375     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6376
6377     PERL_UNUSED_ARG(value);
6378
6379     if (flags & RXapif_FETCH) {
6380         return reg_named_buff_fetch(rx, key, flags);
6381     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6382         Perl_croak_no_modify();
6383         return NULL;
6384     } else if (flags & RXapif_EXISTS) {
6385         return reg_named_buff_exists(rx, key, flags)
6386             ? &PL_sv_yes
6387             : &PL_sv_no;
6388     } else if (flags & RXapif_REGNAMES) {
6389         return reg_named_buff_all(rx, flags);
6390     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6391         return reg_named_buff_scalar(rx, flags);
6392     } else {
6393         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6394         return NULL;
6395     }
6396 }
6397
6398 SV*
6399 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6400                          const U32 flags)
6401 {
6402     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6403     PERL_UNUSED_ARG(lastkey);
6404
6405     if (flags & RXapif_FIRSTKEY)
6406         return reg_named_buff_firstkey(rx, flags);
6407     else if (flags & RXapif_NEXTKEY)
6408         return reg_named_buff_nextkey(rx, flags);
6409     else {
6410         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6411         return NULL;
6412     }
6413 }
6414
6415 SV*
6416 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6417                           const U32 flags)
6418 {
6419     AV *retarray = NULL;
6420     SV *ret;
6421     struct regexp *const rx = ReANY(r);
6422
6423     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6424
6425     if (flags & RXapif_ALL)
6426         retarray=newAV();
6427
6428     if (rx && RXp_PAREN_NAMES(rx)) {
6429         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6430         if (he_str) {
6431             IV i;
6432             SV* sv_dat=HeVAL(he_str);
6433             I32 *nums=(I32*)SvPVX(sv_dat);
6434             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6435                 if ((I32)(rx->nparens) >= nums[i]
6436                     && rx->offs[nums[i]].start != -1
6437                     && rx->offs[nums[i]].end != -1)
6438                 {
6439                     ret = newSVpvs("");
6440                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6441                     if (!retarray)
6442                         return ret;
6443                 } else {
6444                     if (retarray)
6445                         ret = newSVsv(&PL_sv_undef);
6446                 }
6447                 if (retarray)
6448                     av_push(retarray, ret);
6449             }
6450             if (retarray)
6451                 return newRV_noinc(MUTABLE_SV(retarray));
6452         }
6453     }
6454     return NULL;
6455 }
6456
6457 bool
6458 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6459                            const U32 flags)
6460 {
6461     struct regexp *const rx = ReANY(r);
6462
6463     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6464
6465     if (rx && RXp_PAREN_NAMES(rx)) {
6466         if (flags & RXapif_ALL) {
6467             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6468         } else {
6469             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6470             if (sv) {
6471                 SvREFCNT_dec_NN(sv);
6472                 return TRUE;
6473             } else {
6474                 return FALSE;
6475             }
6476         }
6477     } else {
6478         return FALSE;
6479     }
6480 }
6481
6482 SV*
6483 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6484 {
6485     struct regexp *const rx = ReANY(r);
6486
6487     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6488
6489     if ( rx && RXp_PAREN_NAMES(rx) ) {
6490         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6491
6492         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6493     } else {
6494         return FALSE;
6495     }
6496 }
6497
6498 SV*
6499 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6500 {
6501     struct regexp *const rx = ReANY(r);
6502     GET_RE_DEBUG_FLAGS_DECL;
6503
6504     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6505
6506     if (rx && RXp_PAREN_NAMES(rx)) {
6507         HV *hv = RXp_PAREN_NAMES(rx);
6508         HE *temphe;
6509         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6510             IV i;
6511             IV parno = 0;
6512             SV* sv_dat = HeVAL(temphe);
6513             I32 *nums = (I32*)SvPVX(sv_dat);
6514             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6515                 if ((I32)(rx->lastparen) >= nums[i] &&
6516                     rx->offs[nums[i]].start != -1 &&
6517                     rx->offs[nums[i]].end != -1)
6518                 {
6519                     parno = nums[i];
6520                     break;
6521                 }
6522             }
6523             if (parno || flags & RXapif_ALL) {
6524                 return newSVhek(HeKEY_hek(temphe));
6525             }
6526         }
6527     }
6528     return NULL;
6529 }
6530
6531 SV*
6532 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6533 {
6534     SV *ret;
6535     AV *av;
6536     I32 length;
6537     struct regexp *const rx = ReANY(r);
6538
6539     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6540
6541     if (rx && RXp_PAREN_NAMES(rx)) {
6542         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6543             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6544         } else if (flags & RXapif_ONE) {
6545             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6546             av = MUTABLE_AV(SvRV(ret));
6547             length = av_len(av);
6548             SvREFCNT_dec_NN(ret);
6549             return newSViv(length + 1);
6550         } else {
6551             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6552             return NULL;
6553         }
6554     }
6555     return &PL_sv_undef;
6556 }
6557
6558 SV*
6559 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6560 {
6561     struct regexp *const rx = ReANY(r);
6562     AV *av = newAV();
6563
6564     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6565
6566     if (rx && RXp_PAREN_NAMES(rx)) {
6567         HV *hv= RXp_PAREN_NAMES(rx);
6568         HE *temphe;
6569         (void)hv_iterinit(hv);
6570         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6571             IV i;
6572             IV parno = 0;
6573             SV* sv_dat = HeVAL(temphe);
6574             I32 *nums = (I32*)SvPVX(sv_dat);
6575             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6576                 if ((I32)(rx->lastparen) >= nums[i] &&
6577                     rx->offs[nums[i]].start != -1 &&
6578                     rx->offs[nums[i]].end != -1)
6579                 {
6580                     parno = nums[i];
6581                     break;
6582                 }
6583             }
6584             if (parno || flags & RXapif_ALL) {
6585                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6586             }
6587         }
6588     }
6589
6590     return newRV_noinc(MUTABLE_SV(av));
6591 }
6592
6593 void
6594 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6595                              SV * const sv)
6596 {
6597     struct regexp *const rx = ReANY(r);
6598     char *s = NULL;
6599     I32 i = 0;
6600     I32 s1, t1;
6601     I32 n = paren;
6602
6603     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6604
6605     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6606            || n == RX_BUFF_IDX_CARET_FULLMATCH
6607            || n == RX_BUFF_IDX_CARET_POSTMATCH
6608          )
6609          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6610     )
6611         goto ret_undef;
6612
6613     if (!rx->subbeg)
6614         goto ret_undef;
6615
6616     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6617         /* no need to distinguish between them any more */
6618         n = RX_BUFF_IDX_FULLMATCH;
6619
6620     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6621         && rx->offs[0].start != -1)
6622     {
6623         /* $`, ${^PREMATCH} */
6624         i = rx->offs[0].start;
6625         s = rx->subbeg;
6626     }
6627     else
6628     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6629         && rx->offs[0].end != -1)
6630     {
6631         /* $', ${^POSTMATCH} */
6632         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6633         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6634     }
6635     else
6636     if ( 0 <= n && n <= (I32)rx->nparens &&
6637         (s1 = rx->offs[n].start) != -1 &&
6638         (t1 = rx->offs[n].end) != -1)
6639     {
6640         /* $&, ${^MATCH},  $1 ... */
6641         i = t1 - s1;
6642         s = rx->subbeg + s1 - rx->suboffset;
6643     } else {
6644         goto ret_undef;
6645     }
6646
6647     assert(s >= rx->subbeg);
6648     assert(rx->sublen >= (s - rx->subbeg) + i );
6649     if (i >= 0) {
6650 #if NO_TAINT_SUPPORT
6651         sv_setpvn(sv, s, i);
6652 #else
6653         const int oldtainted = TAINT_get;
6654         TAINT_NOT;
6655         sv_setpvn(sv, s, i);
6656         TAINT_set(oldtainted);
6657 #endif
6658         if ( (rx->extflags & RXf_CANY_SEEN)
6659             ? (RXp_MATCH_UTF8(rx)
6660                         && (!i || is_utf8_string((U8*)s, i)))
6661             : (RXp_MATCH_UTF8(rx)) )
6662         {
6663             SvUTF8_on(sv);
6664         }
6665         else
6666             SvUTF8_off(sv);
6667         if (TAINTING_get) {
6668             if (RXp_MATCH_TAINTED(rx)) {
6669                 if (SvTYPE(sv) >= SVt_PVMG) {
6670                     MAGIC* const mg = SvMAGIC(sv);
6671                     MAGIC* mgt;
6672                     TAINT;
6673                     SvMAGIC_set(sv, mg->mg_moremagic);
6674                     SvTAINT(sv);
6675                     if ((mgt = SvMAGIC(sv))) {
6676                         mg->mg_moremagic = mgt;
6677                         SvMAGIC_set(sv, mg);
6678                     }
6679                 } else {
6680                     TAINT;
6681                     SvTAINT(sv);
6682                 }
6683             } else
6684                 SvTAINTED_off(sv);
6685         }
6686     } else {
6687       ret_undef:
6688         sv_setsv(sv,&PL_sv_undef);
6689         return;
6690     }
6691 }
6692
6693 void
6694 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6695                                                          SV const * const value)
6696 {
6697     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6698
6699     PERL_UNUSED_ARG(rx);
6700     PERL_UNUSED_ARG(paren);
6701     PERL_UNUSED_ARG(value);
6702
6703     if (!PL_localizing)
6704         Perl_croak_no_modify();
6705 }
6706
6707 I32
6708 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6709                               const I32 paren)
6710 {
6711     struct regexp *const rx = ReANY(r);
6712     I32 i;
6713     I32 s1, t1;
6714
6715     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6716
6717     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6718     switch (paren) {
6719       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6720          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6721             goto warn_undef;
6722         /*FALLTHROUGH*/
6723
6724       case RX_BUFF_IDX_PREMATCH:       /* $` */
6725         if (rx->offs[0].start != -1) {
6726                         i = rx->offs[0].start;
6727                         if (i > 0) {
6728                                 s1 = 0;
6729                                 t1 = i;
6730                                 goto getlen;
6731                         }
6732             }
6733         return 0;
6734
6735       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6736          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6737             goto warn_undef;
6738       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6739             if (rx->offs[0].end != -1) {
6740                         i = rx->sublen - rx->offs[0].end;
6741                         if (i > 0) {
6742                                 s1 = rx->offs[0].end;
6743                                 t1 = rx->sublen;
6744                                 goto getlen;
6745                         }
6746             }
6747         return 0;
6748
6749       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6750          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6751             goto warn_undef;
6752         /*FALLTHROUGH*/
6753
6754       /* $& / ${^MATCH}, $1, $2, ... */
6755       default:
6756             if (paren <= (I32)rx->nparens &&
6757             (s1 = rx->offs[paren].start) != -1 &&
6758             (t1 = rx->offs[paren].end) != -1)
6759             {
6760             i = t1 - s1;
6761             goto getlen;
6762         } else {
6763           warn_undef:
6764             if (ckWARN(WARN_UNINITIALIZED))
6765                 report_uninit((const SV *)sv);
6766             return 0;
6767         }
6768     }
6769   getlen:
6770     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6771         const char * const s = rx->subbeg - rx->suboffset + s1;
6772         const U8 *ep;
6773         STRLEN el;
6774
6775         i = t1 - s1;
6776         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6777                         i = el;
6778     }
6779     return i;
6780 }
6781
6782 SV*
6783 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6784 {
6785     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6786         PERL_UNUSED_ARG(rx);
6787         if (0)
6788             return NULL;
6789         else
6790             return newSVpvs("Regexp");
6791 }
6792
6793 /* Scans the name of a named buffer from the pattern.
6794  * If flags is REG_RSN_RETURN_NULL returns null.
6795  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6796  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6797  * to the parsed name as looked up in the RExC_paren_names hash.
6798  * If there is an error throws a vFAIL().. type exception.
6799  */
6800
6801 #define REG_RSN_RETURN_NULL    0
6802 #define REG_RSN_RETURN_NAME    1
6803 #define REG_RSN_RETURN_DATA    2
6804
6805 STATIC SV*
6806 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6807 {
6808     char *name_start = RExC_parse;
6809
6810     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6811
6812     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6813          /* skip IDFIRST by using do...while */
6814         if (UTF)
6815             do {
6816                 RExC_parse += UTF8SKIP(RExC_parse);
6817             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6818         else
6819             do {
6820                 RExC_parse++;
6821             } while (isWORDCHAR(*RExC_parse));
6822     } else {
6823         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6824         vFAIL("Group name must start with a non-digit word character");
6825     }
6826     if ( flags ) {
6827         SV* sv_name
6828             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6829                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6830         if ( flags == REG_RSN_RETURN_NAME)
6831             return sv_name;
6832         else if (flags==REG_RSN_RETURN_DATA) {
6833             HE *he_str = NULL;
6834             SV *sv_dat = NULL;
6835             if ( ! sv_name )      /* should not happen*/
6836                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6837             if (RExC_paren_names)
6838                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6839             if ( he_str )
6840                 sv_dat = HeVAL(he_str);
6841             if ( ! sv_dat )
6842                 vFAIL("Reference to nonexistent named group");
6843             return sv_dat;
6844         }
6845         else {
6846             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6847                        (unsigned long) flags);
6848         }
6849         assert(0); /* NOT REACHED */
6850     }
6851     return NULL;
6852 }
6853
6854 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6855     int rem=(int)(RExC_end - RExC_parse);                       \
6856     int cut;                                                    \
6857     int num;                                                    \
6858     int iscut=0;                                                \
6859     if (rem>10) {                                               \
6860         rem=10;                                                 \
6861         iscut=1;                                                \
6862     }                                                           \
6863     cut=10-rem;                                                 \
6864     if (RExC_lastparse!=RExC_parse)                             \
6865         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6866             rem, RExC_parse,                                    \
6867             cut + 4,                                            \
6868             iscut ? "..." : "<"                                 \
6869         );                                                      \
6870     else                                                        \
6871         PerlIO_printf(Perl_debug_log,"%16s","");                \
6872                                                                 \
6873     if (SIZE_ONLY)                                              \
6874        num = RExC_size + 1;                                     \
6875     else                                                        \
6876        num=REG_NODE_NUM(RExC_emit);                             \
6877     if (RExC_lastnum!=num)                                      \
6878        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6879     else                                                        \
6880        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6881     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6882         (int)((depth*2)), "",                                   \
6883         (funcname)                                              \
6884     );                                                          \
6885     RExC_lastnum=num;                                           \
6886     RExC_lastparse=RExC_parse;                                  \
6887 })
6888
6889
6890
6891 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6892     DEBUG_PARSE_MSG((funcname));                            \
6893     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6894 })
6895 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6896     DEBUG_PARSE_MSG((funcname));                            \
6897     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6898 })
6899
6900 /* This section of code defines the inversion list object and its methods.  The
6901  * interfaces are highly subject to change, so as much as possible is static to
6902  * this file.  An inversion list is here implemented as a malloc'd C UV array
6903  * with some added info that is placed as UVs at the beginning in a header
6904  * portion.  An inversion list for Unicode is an array of code points, sorted
6905  * by ordinal number.  The zeroth element is the first code point in the list.
6906  * The 1th element is the first element beyond that not in the list.  In other
6907  * words, the first range is
6908  *  invlist[0]..(invlist[1]-1)
6909  * The other ranges follow.  Thus every element whose index is divisible by two
6910  * marks the beginning of a range that is in the list, and every element not
6911  * divisible by two marks the beginning of a range not in the list.  A single
6912  * element inversion list that contains the single code point N generally
6913  * consists of two elements
6914  *  invlist[0] == N
6915  *  invlist[1] == N+1
6916  * (The exception is when N is the highest representable value on the
6917  * machine, in which case the list containing just it would be a single
6918  * element, itself.  By extension, if the last range in the list extends to
6919  * infinity, then the first element of that range will be in the inversion list
6920  * at a position that is divisible by two, and is the final element in the
6921  * list.)
6922  * Taking the complement (inverting) an inversion list is quite simple, if the
6923  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6924  * This implementation reserves an element at the beginning of each inversion
6925  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6926  * actual beginning of the list is either that element if 0, or the next one if
6927  * 1.
6928  *
6929  * More about inversion lists can be found in "Unicode Demystified"
6930  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6931  * More will be coming when functionality is added later.
6932  *
6933  * The inversion list data structure is currently implemented as an SV pointing
6934  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6935  * array of UV whose memory management is automatically handled by the existing
6936  * facilities for SV's.
6937  *
6938  * Some of the methods should always be private to the implementation, and some
6939  * should eventually be made public */
6940
6941 /* The header definitions are in F<inline_invlist.c> */
6942 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6943 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6944
6945 #define INVLIST_INITIAL_LEN 10
6946
6947 PERL_STATIC_INLINE UV*
6948 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6949 {
6950     /* Returns a pointer to the first element in the inversion list's array.
6951      * This is called upon initialization of an inversion list.  Where the
6952      * array begins depends on whether the list has the code point U+0000
6953      * in it or not.  The other parameter tells it whether the code that
6954      * follows this call is about to put a 0 in the inversion list or not.
6955      * The first element is either the element with 0, if 0, or the next one,
6956      * if 1 */
6957
6958     UV* zero = get_invlist_zero_addr(invlist);
6959
6960     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6961
6962     /* Must be empty */
6963     assert(! *_get_invlist_len_addr(invlist));
6964
6965     /* 1^1 = 0; 1^0 = 1 */
6966     *zero = 1 ^ will_have_0;
6967     return zero + *zero;
6968 }
6969
6970 PERL_STATIC_INLINE UV*
6971 S_invlist_array(pTHX_ SV* const invlist)
6972 {
6973     /* Returns the pointer to the inversion list's array.  Every time the
6974      * length changes, this needs to be called in case malloc or realloc moved
6975      * it */
6976
6977     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6978
6979     /* Must not be empty.  If these fail, you probably didn't check for <len>
6980      * being non-zero before trying to get the array */
6981     assert(*_get_invlist_len_addr(invlist));
6982     assert(*get_invlist_zero_addr(invlist) == 0
6983            || *get_invlist_zero_addr(invlist) == 1);
6984
6985     /* The array begins either at the element reserved for zero if the
6986      * list contains 0 (that element will be set to 0), or otherwise the next
6987      * element (in which case the reserved element will be set to 1). */
6988     return (UV *) (get_invlist_zero_addr(invlist)
6989                    + *get_invlist_zero_addr(invlist));
6990 }
6991
6992 PERL_STATIC_INLINE void
6993 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6994 {
6995     /* Sets the current number of elements stored in the inversion list */
6996
6997     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6998
6999     *_get_invlist_len_addr(invlist) = len;
7000
7001     assert(len <= SvLEN(invlist));
7002
7003     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7004     /* If the list contains U+0000, that element is part of the header,
7005      * and should not be counted as part of the array.  It will contain
7006      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7007      * subtract:
7008      *  SvCUR_set(invlist,
7009      *            TO_INTERNAL_SIZE(len
7010      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7011      * But, this is only valid if len is not 0.  The consequences of not doing
7012      * this is that the memory allocation code may think that 1 more UV is
7013      * being used than actually is, and so might do an unnecessary grow.  That
7014      * seems worth not bothering to make this the precise amount.
7015      *
7016      * Note that when inverting, SvCUR shouldn't change */
7017 }
7018
7019 PERL_STATIC_INLINE IV*
7020 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7021 {
7022     /* Return the address of the UV that is reserved to hold the cached index
7023      * */
7024
7025     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7026
7027     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7028 }
7029
7030 PERL_STATIC_INLINE IV
7031 S_invlist_previous_index(pTHX_ SV* const invlist)
7032 {
7033     /* Returns cached index of previous search */
7034
7035     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7036
7037     return *get_invlist_previous_index_addr(invlist);
7038 }
7039
7040 PERL_STATIC_INLINE void
7041 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7042 {
7043     /* Caches <index> for later retrieval */
7044
7045     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7046
7047     assert(index == 0 || index < (int) _invlist_len(invlist));
7048
7049     *get_invlist_previous_index_addr(invlist) = index;
7050 }
7051
7052 PERL_STATIC_INLINE UV
7053 S_invlist_max(pTHX_ SV* const invlist)
7054 {
7055     /* Returns the maximum number of elements storable in the inversion list's
7056      * array, without having to realloc() */
7057
7058     PERL_ARGS_ASSERT_INVLIST_MAX;
7059
7060     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7061            ? _invlist_len(invlist)
7062            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7063 }
7064
7065 PERL_STATIC_INLINE UV*
7066 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7067 {
7068     /* Return the address of the UV that is reserved to hold 0 if the inversion
7069      * list contains 0.  This has to be the last element of the heading, as the
7070      * list proper starts with either it if 0, or the next element if not.
7071      * (But we force it to contain either 0 or 1) */
7072
7073     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7074
7075     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7076 }
7077
7078 #ifndef PERL_IN_XSUB_RE
7079 SV*
7080 Perl__new_invlist(pTHX_ IV initial_size)
7081 {
7082
7083     /* Return a pointer to a newly constructed inversion list, with enough
7084      * space to store 'initial_size' elements.  If that number is negative, a
7085      * system default is used instead */
7086
7087     SV* new_list;
7088
7089     if (initial_size < 0) {
7090         initial_size = INVLIST_INITIAL_LEN;
7091     }
7092
7093     /* Allocate the initial space */
7094     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7095     invlist_set_len(new_list, 0);
7096
7097     /* Force iterinit() to be used to get iteration to work */
7098     *get_invlist_iter_addr(new_list) = UV_MAX;
7099
7100     /* This should force a segfault if a method doesn't initialize this
7101      * properly */
7102     *get_invlist_zero_addr(new_list) = UV_MAX;
7103
7104     *get_invlist_previous_index_addr(new_list) = 0;
7105     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7106 #if HEADER_LENGTH != 5
7107 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7108 #endif
7109
7110     return new_list;
7111 }
7112 #endif
7113
7114 STATIC SV*
7115 S__new_invlist_C_array(pTHX_ UV* list)
7116 {
7117     /* Return a pointer to a newly constructed inversion list, initialized to
7118      * point to <list>, which has to be in the exact correct inversion list
7119      * form, including internal fields.  Thus this is a dangerous routine that
7120      * should not be used in the wrong hands */
7121
7122     SV* invlist = newSV_type(SVt_PV);
7123
7124     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7125
7126     SvPV_set(invlist, (char *) list);
7127     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7128                                shouldn't touch it */
7129     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7130
7131     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7132         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7133     }
7134
7135     /* Initialize the iteration pointer.
7136      * XXX This could be done at compile time in charclass_invlists.h, but I
7137      * (khw) am not confident that the suffixes for specifying the C constant
7138      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7139      * to use 64 bits; might need a Configure probe */
7140     invlist_iterfinish(invlist);
7141
7142     return invlist;
7143 }
7144
7145 STATIC void
7146 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7147 {
7148     /* Grow the maximum size of an inversion list */
7149
7150     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7151
7152     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7153 }
7154
7155 PERL_STATIC_INLINE void
7156 S_invlist_trim(pTHX_ SV* const invlist)
7157 {
7158     PERL_ARGS_ASSERT_INVLIST_TRIM;
7159
7160     /* Change the length of the inversion list to how many entries it currently
7161      * has */
7162
7163     SvPV_shrink_to_cur((SV *) invlist);
7164 }
7165
7166 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7167
7168 STATIC void
7169 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7170 {
7171    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7172     * the end of the inversion list.  The range must be above any existing
7173     * ones. */
7174
7175     UV* array;
7176     UV max = invlist_max(invlist);
7177     UV len = _invlist_len(invlist);
7178
7179     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7180
7181     if (len == 0) { /* Empty lists must be initialized */
7182         array = _invlist_array_init(invlist, start == 0);
7183     }
7184     else {
7185         /* Here, the existing list is non-empty. The current max entry in the
7186          * list is generally the first value not in the set, except when the
7187          * set extends to the end of permissible values, in which case it is
7188          * the first entry in that final set, and so this call is an attempt to
7189          * append out-of-order */
7190
7191         UV final_element = len - 1;
7192         array = invlist_array(invlist);
7193         if (array[final_element] > start
7194             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7195         {
7196             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7197                        array[final_element], start,
7198                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7199         }
7200
7201         /* Here, it is a legal append.  If the new range begins with the first
7202          * value not in the set, it is extending the set, so the new first
7203          * value not in the set is one greater than the newly extended range.
7204          * */
7205         if (array[final_element] == start) {
7206             if (end != UV_MAX) {
7207                 array[final_element] = end + 1;
7208             }
7209             else {
7210                 /* But if the end is the maximum representable on the machine,
7211                  * just let the range that this would extend to have no end */
7212                 invlist_set_len(invlist, len - 1);
7213             }
7214             return;
7215         }
7216     }
7217
7218     /* Here the new range doesn't extend any existing set.  Add it */
7219
7220     len += 2;   /* Includes an element each for the start and end of range */
7221
7222     /* If overflows the existing space, extend, which may cause the array to be
7223      * moved */
7224     if (max < len) {
7225         invlist_extend(invlist, len);
7226         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7227                                            failure in invlist_array() */
7228         array = invlist_array(invlist);
7229     }
7230     else {
7231         invlist_set_len(invlist, len);
7232     }
7233
7234     /* The next item on the list starts the range, the one after that is
7235      * one past the new range.  */
7236     array[len - 2] = start;
7237     if (end != UV_MAX) {
7238         array[len - 1] = end + 1;
7239     }
7240     else {
7241         /* But if the end is the maximum representable on the machine, just let
7242          * the range have no end */
7243         invlist_set_len(invlist, len - 1);
7244     }
7245 }
7246
7247 #ifndef PERL_IN_XSUB_RE
7248
7249 IV
7250 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7251 {
7252     /* Searches the inversion list for the entry that contains the input code
7253      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7254      * return value is the index into the list's array of the range that
7255      * contains <cp> */
7256
7257     IV low = 0;
7258     IV mid;
7259     IV high = _invlist_len(invlist);
7260     const IV highest_element = high - 1;
7261     const UV* array;
7262
7263     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7264
7265     /* If list is empty, return failure. */
7266     if (high == 0) {
7267         return -1;
7268     }
7269
7270     /* (We can't get the array unless we know the list is non-empty) */
7271     array = invlist_array(invlist);
7272
7273     mid = invlist_previous_index(invlist);
7274     assert(mid >=0 && mid <= highest_element);
7275
7276     /* <mid> contains the cache of the result of the previous call to this
7277      * function (0 the first time).  See if this call is for the same result,
7278      * or if it is for mid-1.  This is under the theory that calls to this
7279      * function will often be for related code points that are near each other.
7280      * And benchmarks show that caching gives better results.  We also test
7281      * here if the code point is within the bounds of the list.  These tests
7282      * replace others that would have had to be made anyway to make sure that
7283      * the array bounds were not exceeded, and these give us extra information
7284      * at the same time */
7285     if (cp >= array[mid]) {
7286         if (cp >= array[highest_element]) {
7287             return highest_element;
7288         }
7289
7290         /* Here, array[mid] <= cp < array[highest_element].  This means that
7291          * the final element is not the answer, so can exclude it; it also
7292          * means that <mid> is not the final element, so can refer to 'mid + 1'
7293          * safely */
7294         if (cp < array[mid + 1]) {
7295             return mid;
7296         }
7297         high--;
7298         low = mid + 1;
7299     }
7300     else { /* cp < aray[mid] */
7301         if (cp < array[0]) { /* Fail if outside the array */
7302             return -1;
7303         }
7304         high = mid;
7305         if (cp >= array[mid - 1]) {
7306             goto found_entry;
7307         }
7308     }
7309
7310     /* Binary search.  What we are looking for is <i> such that
7311      *  array[i] <= cp < array[i+1]
7312      * The loop below converges on the i+1.  Note that there may not be an
7313      * (i+1)th element in the array, and things work nonetheless */
7314     while (low < high) {
7315         mid = (low + high) / 2;
7316         assert(mid <= highest_element);
7317         if (array[mid] <= cp) { /* cp >= array[mid] */
7318             low = mid + 1;
7319
7320             /* We could do this extra test to exit the loop early.
7321             if (cp < array[low]) {
7322                 return mid;
7323             }
7324             */
7325         }
7326         else { /* cp < array[mid] */
7327             high = mid;
7328         }
7329     }
7330
7331   found_entry:
7332     high--;
7333     invlist_set_previous_index(invlist, high);
7334     return high;
7335 }
7336
7337 void
7338 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7339 {
7340     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7341      * but is used when the swash has an inversion list.  This makes this much
7342      * faster, as it uses a binary search instead of a linear one.  This is
7343      * intimately tied to that function, and perhaps should be in utf8.c,
7344      * except it is intimately tied to inversion lists as well.  It assumes
7345      * that <swatch> is all 0's on input */
7346
7347     UV current = start;
7348     const IV len = _invlist_len(invlist);
7349     IV i;
7350     const UV * array;
7351
7352     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7353
7354     if (len == 0) { /* Empty inversion list */
7355         return;
7356     }
7357
7358     array = invlist_array(invlist);
7359
7360     /* Find which element it is */
7361     i = _invlist_search(invlist, start);
7362
7363     /* We populate from <start> to <end> */
7364     while (current < end) {
7365         UV upper;
7366
7367         /* The inversion list gives the results for every possible code point
7368          * after the first one in the list.  Only those ranges whose index is
7369          * even are ones that the inversion list matches.  For the odd ones,
7370          * and if the initial code point is not in the list, we have to skip
7371          * forward to the next element */
7372         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7373             i++;
7374             if (i >= len) { /* Finished if beyond the end of the array */
7375                 return;
7376             }
7377             current = array[i];
7378             if (current >= end) {   /* Finished if beyond the end of what we
7379                                        are populating */
7380                 if (LIKELY(end < UV_MAX)) {
7381                     return;
7382                 }
7383
7384                 /* We get here when the upper bound is the maximum
7385                  * representable on the machine, and we are looking for just
7386                  * that code point.  Have to special case it */
7387                 i = len;
7388                 goto join_end_of_list;
7389             }
7390         }
7391         assert(current >= start);
7392
7393         /* The current range ends one below the next one, except don't go past
7394          * <end> */
7395         i++;
7396         upper = (i < len && array[i] < end) ? array[i] : end;
7397
7398         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7399          * for each code point in it */
7400         for (; current < upper; current++) {
7401             const STRLEN offset = (STRLEN)(current - start);
7402             swatch[offset >> 3] |= 1 << (offset & 7);
7403         }
7404
7405     join_end_of_list:
7406
7407         /* Quit if at the end of the list */
7408         if (i >= len) {
7409
7410             /* But first, have to deal with the highest possible code point on
7411              * the platform.  The previous code assumes that <end> is one
7412              * beyond where we want to populate, but that is impossible at the
7413              * platform's infinity, so have to handle it specially */
7414             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7415             {
7416                 const STRLEN offset = (STRLEN)(end - start);
7417                 swatch[offset >> 3] |= 1 << (offset & 7);
7418             }
7419             return;
7420         }
7421
7422         /* Advance to the next range, which will be for code points not in the
7423          * inversion list */
7424         current = array[i];
7425     }
7426
7427     return;
7428 }
7429
7430 void
7431 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7432 {
7433     /* Take the union of two inversion lists and point <output> to it.  *output
7434      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7435      * the reference count to that list will be decremented.  The first list,
7436      * <a>, may be NULL, in which case a copy of the second list is returned.
7437      * If <complement_b> is TRUE, the union is taken of the complement
7438      * (inversion) of <b> instead of b itself.
7439      *
7440      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7441      * Richard Gillam, published by Addison-Wesley, and explained at some
7442      * length there.  The preface says to incorporate its examples into your
7443      * code at your own risk.
7444      *
7445      * The algorithm is like a merge sort.
7446      *
7447      * XXX A potential performance improvement is to keep track as we go along
7448      * if only one of the inputs contributes to the result, meaning the other
7449      * is a subset of that one.  In that case, we can skip the final copy and
7450      * return the larger of the input lists, but then outside code might need
7451      * to keep track of whether to free the input list or not */
7452
7453     UV* array_a;    /* a's array */
7454     UV* array_b;
7455     UV len_a;       /* length of a's array */
7456     UV len_b;
7457
7458     SV* u;                      /* the resulting union */
7459     UV* array_u;
7460     UV len_u;
7461
7462     UV i_a = 0;             /* current index into a's array */
7463     UV i_b = 0;
7464     UV i_u = 0;
7465
7466     /* running count, as explained in the algorithm source book; items are
7467      * stopped accumulating and are output when the count changes to/from 0.
7468      * The count is incremented when we start a range that's in the set, and
7469      * decremented when we start a range that's not in the set.  So its range
7470      * is 0 to 2.  Only when the count is zero is something not in the set.
7471      */
7472     UV count = 0;
7473
7474     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7475     assert(a != b);
7476
7477     /* If either one is empty, the union is the other one */
7478     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7479         if (*output == a) {
7480             if (a != NULL) {
7481                 SvREFCNT_dec_NN(a);
7482             }
7483         }
7484         if (*output != b) {
7485             *output = invlist_clone(b);
7486             if (complement_b) {
7487                 _invlist_invert(*output);
7488             }
7489         } /* else *output already = b; */
7490         return;
7491     }
7492     else if ((len_b = _invlist_len(b)) == 0) {
7493         if (*output == b) {
7494             SvREFCNT_dec_NN(b);
7495         }
7496
7497         /* The complement of an empty list is a list that has everything in it,
7498          * so the union with <a> includes everything too */
7499         if (complement_b) {
7500             if (a == *output) {
7501                 SvREFCNT_dec_NN(a);
7502             }
7503             *output = _new_invlist(1);
7504             _append_range_to_invlist(*output, 0, UV_MAX);
7505         }
7506         else if (*output != a) {
7507             *output = invlist_clone(a);
7508         }
7509         /* else *output already = a; */
7510         return;
7511     }
7512
7513     /* Here both lists exist and are non-empty */
7514     array_a = invlist_array(a);
7515     array_b = invlist_array(b);
7516
7517     /* If are to take the union of 'a' with the complement of b, set it
7518      * up so are looking at b's complement. */
7519     if (complement_b) {
7520
7521         /* To complement, we invert: if the first element is 0, remove it.  To
7522          * do this, we just pretend the array starts one later, and clear the
7523          * flag as we don't have to do anything else later */
7524         if (array_b[0] == 0) {
7525             array_b++;
7526             len_b--;
7527             complement_b = FALSE;
7528         }
7529         else {
7530
7531             /* But if the first element is not zero, we unshift a 0 before the
7532              * array.  The data structure reserves a space for that 0 (which
7533              * should be a '1' right now), so physical shifting is unneeded,
7534              * but temporarily change that element to 0.  Before exiting the
7535              * routine, we must restore the element to '1' */
7536             array_b--;
7537             len_b++;
7538             array_b[0] = 0;
7539         }
7540     }
7541
7542     /* Size the union for the worst case: that the sets are completely
7543      * disjoint */
7544     u = _new_invlist(len_a + len_b);
7545
7546     /* Will contain U+0000 if either component does */
7547     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7548                                       || (len_b > 0 && array_b[0] == 0));
7549
7550     /* Go through each list item by item, stopping when exhausted one of
7551      * them */
7552     while (i_a < len_a && i_b < len_b) {
7553         UV cp;      /* The element to potentially add to the union's array */
7554         bool cp_in_set;   /* is it in the the input list's set or not */
7555
7556         /* We need to take one or the other of the two inputs for the union.
7557          * Since we are merging two sorted lists, we take the smaller of the
7558          * next items.  In case of a tie, we take the one that is in its set
7559          * first.  If we took one not in the set first, it would decrement the
7560          * count, possibly to 0 which would cause it to be output as ending the
7561          * range, and the next time through we would take the same number, and
7562          * output it again as beginning the next range.  By doing it the
7563          * opposite way, there is no possibility that the count will be
7564          * momentarily decremented to 0, and thus the two adjoining ranges will
7565          * be seamlessly merged.  (In a tie and both are in the set or both not
7566          * in the set, it doesn't matter which we take first.) */
7567         if (array_a[i_a] < array_b[i_b]
7568             || (array_a[i_a] == array_b[i_b]
7569                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7570         {
7571             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7572             cp= array_a[i_a++];
7573         }
7574         else {
7575             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7576             cp = array_b[i_b++];
7577         }
7578
7579         /* Here, have chosen which of the two inputs to look at.  Only output
7580          * if the running count changes to/from 0, which marks the
7581          * beginning/end of a range in that's in the set */
7582         if (cp_in_set) {
7583             if (count == 0) {
7584                 array_u[i_u++] = cp;
7585             }
7586             count++;
7587         }
7588         else {
7589             count--;
7590             if (count == 0) {
7591                 array_u[i_u++] = cp;
7592             }
7593         }
7594     }
7595
7596     /* Here, we are finished going through at least one of the lists, which
7597      * means there is something remaining in at most one.  We check if the list
7598      * that hasn't been exhausted is positioned such that we are in the middle
7599      * of a range in its set or not.  (i_a and i_b point to the element beyond
7600      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7601      * is potentially more to output.
7602      * There are four cases:
7603      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7604      *     in the union is entirely from the non-exhausted set.
7605      *  2) Both were in their sets, count is 2.  Nothing further should
7606      *     be output, as everything that remains will be in the exhausted
7607      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7608      *     that
7609      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7610      *     Nothing further should be output because the union includes
7611      *     everything from the exhausted set.  Not decrementing ensures that.
7612      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7613      *     decrementing to 0 insures that we look at the remainder of the
7614      *     non-exhausted set */
7615     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7616         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7617     {
7618         count--;
7619     }
7620
7621     /* The final length is what we've output so far, plus what else is about to
7622      * be output.  (If 'count' is non-zero, then the input list we exhausted
7623      * has everything remaining up to the machine's limit in its set, and hence
7624      * in the union, so there will be no further output. */
7625     len_u = i_u;
7626     if (count == 0) {
7627         /* At most one of the subexpressions will be non-zero */
7628         len_u += (len_a - i_a) + (len_b - i_b);
7629     }
7630
7631     /* Set result to final length, which can change the pointer to array_u, so
7632      * re-find it */
7633     if (len_u != _invlist_len(u)) {
7634         invlist_set_len(u, len_u);
7635         invlist_trim(u);
7636         array_u = invlist_array(u);
7637     }
7638
7639     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7640      * the other) ended with everything above it not in its set.  That means
7641      * that the remaining part of the union is precisely the same as the
7642      * non-exhausted list, so can just copy it unchanged.  (If both list were
7643      * exhausted at the same time, then the operations below will be both 0.)
7644      */
7645     if (count == 0) {
7646         IV copy_count; /* At most one will have a non-zero copy count */
7647         if ((copy_count = len_a - i_a) > 0) {
7648             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7649         }
7650         else if ((copy_count = len_b - i_b) > 0) {
7651             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7652         }
7653     }
7654
7655     /* If we've changed b, restore it */
7656     if (complement_b) {
7657         array_b[0] = 1;
7658     }
7659
7660     /*  We may be removing a reference to one of the inputs */
7661     if (a == *output || b == *output) {
7662         assert(! invlist_is_iterating(*output));
7663         SvREFCNT_dec_NN(*output);
7664     }
7665
7666     *output = u;
7667     return;
7668 }
7669
7670 void
7671 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7672 {
7673     /* Take the intersection of two inversion lists and point <i> to it.  *i
7674      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7675      * the reference count to that list will be decremented.
7676      * If <complement_b> is TRUE, the result will be the intersection of <a>
7677      * and the complement (or inversion) of <b> instead of <b> directly.
7678      *
7679      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7680      * Richard Gillam, published by Addison-Wesley, and explained at some
7681      * length there.  The preface says to incorporate its examples into your
7682      * code at your own risk.  In fact, it had bugs
7683      *
7684      * The algorithm is like a merge sort, and is essentially the same as the
7685      * union above
7686      */
7687
7688     UV* array_a;                /* a's array */
7689     UV* array_b;
7690     UV len_a;   /* length of a's array */
7691     UV len_b;
7692
7693     SV* r;                   /* the resulting intersection */
7694     UV* array_r;
7695     UV len_r;
7696
7697     UV i_a = 0;             /* current index into a's array */
7698     UV i_b = 0;
7699     UV i_r = 0;
7700
7701     /* running count, as explained in the algorithm source book; items are
7702      * stopped accumulating and are output when the count changes to/from 2.
7703      * The count is incremented when we start a range that's in the set, and
7704      * decremented when we start a range that's not in the set.  So its range
7705      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7706      */
7707     UV count = 0;
7708
7709     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7710     assert(a != b);
7711
7712     /* Special case if either one is empty */
7713     len_a = _invlist_len(a);
7714     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7715
7716         if (len_a != 0 && complement_b) {
7717
7718             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7719              * be empty.  Here, also we are using 'b's complement, which hence
7720              * must be every possible code point.  Thus the intersection is
7721              * simply 'a'. */
7722             if (*i != a) {
7723                 *i = invlist_clone(a);
7724
7725                 if (*i == b) {
7726                     SvREFCNT_dec_NN(b);
7727                 }
7728             }
7729             /* else *i is already 'a' */
7730             return;
7731         }
7732
7733         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7734          * intersection must be empty */
7735         if (*i == a) {
7736             SvREFCNT_dec_NN(a);
7737         }
7738         else if (*i == b) {
7739             SvREFCNT_dec_NN(b);
7740         }
7741         *i = _new_invlist(0);
7742         return;
7743     }
7744
7745     /* Here both lists exist and are non-empty */
7746     array_a = invlist_array(a);
7747     array_b = invlist_array(b);
7748
7749     /* If are to take the intersection of 'a' with the complement of b, set it
7750      * up so are looking at b's complement. */
7751     if (complement_b) {
7752
7753         /* To complement, we invert: if the first element is 0, remove it.  To
7754          * do this, we just pretend the array starts one later, and clear the
7755          * flag as we don't have to do anything else later */
7756         if (array_b[0] == 0) {
7757             array_b++;
7758             len_b--;
7759             complement_b = FALSE;
7760         }
7761         else {
7762
7763             /* But if the first element is not zero, we unshift a 0 before the
7764              * array.  The data structure reserves a space for that 0 (which
7765              * should be a '1' right now), so physical shifting is unneeded,
7766              * but temporarily change that element to 0.  Before exiting the
7767              * routine, we must restore the element to '1' */
7768             array_b--;
7769             len_b++;
7770             array_b[0] = 0;
7771         }
7772     }
7773
7774     /* Size the intersection for the worst case: that the intersection ends up
7775      * fragmenting everything to be completely disjoint */
7776     r= _new_invlist(len_a + len_b);
7777
7778     /* Will contain U+0000 iff both components do */
7779     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7780                                      && len_b > 0 && array_b[0] == 0);
7781
7782     /* Go through each list item by item, stopping when exhausted one of
7783      * them */
7784     while (i_a < len_a && i_b < len_b) {
7785         UV cp;      /* The element to potentially add to the intersection's
7786                        array */
7787         bool cp_in_set; /* Is it in the input list's set or not */
7788
7789         /* We need to take one or the other of the two inputs for the
7790          * intersection.  Since we are merging two sorted lists, we take the
7791          * smaller of the next items.  In case of a tie, we take the one that
7792          * is not in its set first (a difference from the union algorithm).  If
7793          * we took one in the set first, it would increment the count, possibly
7794          * to 2 which would cause it to be output as starting a range in the
7795          * intersection, and the next time through we would take that same
7796          * number, and output it again as ending the set.  By doing it the
7797          * opposite of this, there is no possibility that the count will be
7798          * momentarily incremented to 2.  (In a tie and both are in the set or
7799          * both not in the set, it doesn't matter which we take first.) */
7800         if (array_a[i_a] < array_b[i_b]
7801             || (array_a[i_a] == array_b[i_b]
7802                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7803         {
7804             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7805             cp= array_a[i_a++];
7806         }
7807         else {
7808             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7809             cp= array_b[i_b++];
7810         }
7811
7812         /* Here, have chosen which of the two inputs to look at.  Only output
7813          * if the running count changes to/from 2, which marks the
7814          * beginning/end of a range that's in the intersection */
7815         if (cp_in_set) {
7816             count++;
7817             if (count == 2) {
7818                 array_r[i_r++] = cp;
7819             }
7820         }
7821         else {
7822             if (count == 2) {
7823                 array_r[i_r++] = cp;
7824             }
7825             count--;
7826         }
7827     }
7828
7829     /* Here, we are finished going through at least one of the lists, which
7830      * means there is something remaining in at most one.  We check if the list
7831      * that has been exhausted is positioned such that we are in the middle
7832      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7833      * the ones we care about.)  There are four cases:
7834      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7835      *     nothing left in the intersection.
7836      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7837      *     above 2.  What should be output is exactly that which is in the
7838      *     non-exhausted set, as everything it has is also in the intersection
7839      *     set, and everything it doesn't have can't be in the intersection
7840      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7841      *     gets incremented to 2.  Like the previous case, the intersection is
7842      *     everything that remains in the non-exhausted set.
7843      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7844      *     remains 1.  And the intersection has nothing more. */
7845     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7846         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7847     {
7848         count++;
7849     }
7850
7851     /* The final length is what we've output so far plus what else is in the
7852      * intersection.  At most one of the subexpressions below will be non-zero */
7853     len_r = i_r;
7854     if (count >= 2) {
7855         len_r += (len_a - i_a) + (len_b - i_b);
7856     }
7857
7858     /* Set result to final length, which can change the pointer to array_r, so
7859      * re-find it */
7860     if (len_r != _invlist_len(r)) {
7861         invlist_set_len(r, len_r);
7862         invlist_trim(r);
7863         array_r = invlist_array(r);
7864     }
7865
7866     /* Finish outputting any remaining */
7867     if (count >= 2) { /* At most one will have a non-zero copy count */
7868         IV copy_count;
7869         if ((copy_count = len_a - i_a) > 0) {
7870             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7871         }
7872         else if ((copy_count = len_b - i_b) > 0) {
7873             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7874         }
7875     }
7876
7877     /* If we've changed b, restore it */
7878     if (complement_b) {
7879         array_b[0] = 1;
7880     }
7881
7882     /*  We may be removing a reference to one of the inputs */
7883     if (a == *i || b == *i) {
7884         assert(! invlist_is_iterating(*i));
7885         SvREFCNT_dec_NN(*i);
7886     }
7887
7888     *i = r;
7889     return;
7890 }
7891
7892 SV*
7893 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7894 {
7895     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7896      * set.  A pointer to the inversion list is returned.  This may actually be
7897      * a new list, in which case the passed in one has been destroyed.  The
7898      * passed in inversion list can be NULL, in which case a new one is created
7899      * with just the one range in it */
7900
7901     SV* range_invlist;
7902     UV len;
7903
7904     if (invlist == NULL) {
7905         invlist = _new_invlist(2);
7906         len = 0;
7907     }
7908     else {
7909         len = _invlist_len(invlist);
7910     }
7911
7912     /* If comes after the final entry actually in the list, can just append it
7913      * to the end, */
7914     if (len == 0
7915         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7916             && start >= invlist_array(invlist)[len - 1]))
7917     {
7918         _append_range_to_invlist(invlist, start, end);
7919         return invlist;
7920     }
7921
7922     /* Here, can't just append things, create and return a new inversion list
7923      * which is the union of this range and the existing inversion list */
7924     range_invlist = _new_invlist(2);
7925     _append_range_to_invlist(range_invlist, start, end);
7926
7927     _invlist_union(invlist, range_invlist, &invlist);
7928
7929     /* The temporary can be freed */
7930     SvREFCNT_dec_NN(range_invlist);
7931
7932     return invlist;
7933 }
7934
7935 #endif
7936
7937 PERL_STATIC_INLINE SV*
7938 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7939     return _add_range_to_invlist(invlist, cp, cp);
7940 }
7941
7942 #ifndef PERL_IN_XSUB_RE
7943 void
7944 Perl__invlist_invert(pTHX_ SV* const invlist)
7945 {
7946     /* Complement the input inversion list.  This adds a 0 if the list didn't
7947      * have a zero; removes it otherwise.  As described above, the data
7948      * structure is set up so that this is very efficient */
7949
7950     UV* len_pos = _get_invlist_len_addr(invlist);
7951
7952     PERL_ARGS_ASSERT__INVLIST_INVERT;
7953
7954     assert(! invlist_is_iterating(invlist));
7955
7956     /* The inverse of matching nothing is matching everything */
7957     if (*len_pos == 0) {
7958         _append_range_to_invlist(invlist, 0, UV_MAX);
7959         return;
7960     }
7961
7962     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7963      * zero element was a 0, so it is being removed, so the length decrements
7964      * by 1; and vice-versa.  SvCUR is unaffected */
7965     if (*get_invlist_zero_addr(invlist) ^= 1) {
7966         (*len_pos)--;
7967     }
7968     else {
7969         (*len_pos)++;
7970     }
7971 }
7972
7973 void
7974 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7975 {
7976     /* Complement the input inversion list (which must be a Unicode property,
7977      * all of which don't match above the Unicode maximum code point.)  And
7978      * Perl has chosen to not have the inversion match above that either.  This
7979      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7980      */
7981
7982     UV len;
7983     UV* array;
7984
7985     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7986
7987     _invlist_invert(invlist);
7988
7989     len = _invlist_len(invlist);
7990
7991     if (len != 0) { /* If empty do nothing */
7992         array = invlist_array(invlist);
7993         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7994             /* Add 0x110000.  First, grow if necessary */
7995             len++;
7996             if (invlist_max(invlist) < len) {
7997                 invlist_extend(invlist, len);
7998                 array = invlist_array(invlist);
7999             }
8000             invlist_set_len(invlist, len);
8001             array[len - 1] = PERL_UNICODE_MAX + 1;
8002         }
8003         else {  /* Remove the 0x110000 */
8004             invlist_set_len(invlist, len - 1);
8005         }
8006     }
8007
8008     return;
8009 }
8010 #endif
8011
8012 PERL_STATIC_INLINE SV*
8013 S_invlist_clone(pTHX_ SV* const invlist)
8014 {
8015
8016     /* Return a new inversion list that is a copy of the input one, which is
8017      * unchanged */
8018
8019     /* Need to allocate extra space to accommodate Perl's addition of a
8020      * trailing NUL to SvPV's, since it thinks they are always strings */
8021     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8022     STRLEN length = SvCUR(invlist);
8023
8024     PERL_ARGS_ASSERT_INVLIST_CLONE;
8025
8026     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8027     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8028
8029     return new_invlist;
8030 }
8031
8032 PERL_STATIC_INLINE UV*
8033 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8034 {
8035     /* Return the address of the UV that contains the current iteration
8036      * position */
8037
8038     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8039
8040     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8041 }
8042
8043 PERL_STATIC_INLINE UV*
8044 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8045 {
8046     /* Return the address of the UV that contains the version id. */
8047
8048     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8049
8050     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8051 }
8052
8053 PERL_STATIC_INLINE void
8054 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8055 {
8056     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8057
8058     *get_invlist_iter_addr(invlist) = 0;
8059 }
8060
8061 PERL_STATIC_INLINE void
8062 S_invlist_iterfinish(pTHX_ SV* invlist)
8063 {
8064     /* Terminate iterator for invlist.  This is to catch development errors.
8065      * Any iteration that is interrupted before completed should call this
8066      * function.  Functions that add code points anywhere else but to the end
8067      * of an inversion list assert that they are not in the middle of an
8068      * iteration.  If they were, the addition would make the iteration
8069      * problematical: if the iteration hadn't reached the place where things
8070      * were being added, it would be ok */
8071
8072     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8073
8074     *get_invlist_iter_addr(invlist) = UV_MAX;
8075 }
8076
8077 STATIC bool
8078 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8079 {
8080     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8081      * This call sets in <*start> and <*end>, the next range in <invlist>.
8082      * Returns <TRUE> if successful and the next call will return the next
8083      * range; <FALSE> if was already at the end of the list.  If the latter,
8084      * <*start> and <*end> are unchanged, and the next call to this function
8085      * will start over at the beginning of the list */
8086
8087     UV* pos = get_invlist_iter_addr(invlist);
8088     UV len = _invlist_len(invlist);
8089     UV *array;
8090
8091     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8092
8093     if (*pos >= len) {
8094         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8095         return FALSE;
8096     }
8097
8098     array = invlist_array(invlist);
8099
8100     *start = array[(*pos)++];
8101
8102     if (*pos >= len) {
8103         *end = UV_MAX;
8104     }
8105     else {
8106         *end = array[(*pos)++] - 1;
8107     }
8108
8109     return TRUE;
8110 }
8111
8112 PERL_STATIC_INLINE bool
8113 S_invlist_is_iterating(pTHX_ SV* const invlist)
8114 {
8115     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8116
8117     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8118 }
8119
8120 PERL_STATIC_INLINE UV
8121 S_invlist_highest(pTHX_ SV* const invlist)
8122 {
8123     /* Returns the highest code point that matches an inversion list.  This API
8124      * has an ambiguity, as it returns 0 under either the highest is actually
8125      * 0, or if the list is empty.  If this distinction matters to you, check
8126      * for emptiness before calling this function */
8127
8128     UV len = _invlist_len(invlist);
8129     UV *array;
8130
8131     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8132
8133     if (len == 0) {
8134         return 0;
8135     }
8136
8137     array = invlist_array(invlist);
8138
8139     /* The last element in the array in the inversion list always starts a
8140      * range that goes to infinity.  That range may be for code points that are
8141      * matched in the inversion list, or it may be for ones that aren't
8142      * matched.  In the latter case, the highest code point in the set is one
8143      * less than the beginning of this range; otherwise it is the final element
8144      * of this range: infinity */
8145     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8146            ? UV_MAX
8147            : array[len - 1] - 1;
8148 }
8149
8150 #ifndef PERL_IN_XSUB_RE
8151 SV *
8152 Perl__invlist_contents(pTHX_ SV* const invlist)
8153 {
8154     /* Get the contents of an inversion list into a string SV so that they can
8155      * be printed out.  It uses the format traditionally done for debug tracing
8156      */
8157
8158     UV start, end;
8159     SV* output = newSVpvs("\n");
8160
8161     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8162
8163     assert(! invlist_is_iterating(invlist));
8164
8165     invlist_iterinit(invlist);
8166     while (invlist_iternext(invlist, &start, &end)) {
8167         if (end == UV_MAX) {
8168             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8169         }
8170         else if (end != start) {
8171             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8172                     start,       end);
8173         }
8174         else {
8175             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8176         }
8177     }
8178
8179     return output;
8180 }
8181 #endif
8182
8183 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8184 void
8185 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8186 {
8187     /* Dumps out the ranges in an inversion list.  The string 'header'
8188      * if present is output on a line before the first range */
8189
8190     UV start, end;
8191
8192     PERL_ARGS_ASSERT__INVLIST_DUMP;
8193
8194     if (header && strlen(header)) {
8195         PerlIO_printf(Perl_debug_log, "%s\n", header);
8196     }
8197     if (invlist_is_iterating(invlist)) {
8198         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8199         return;
8200     }
8201
8202     invlist_iterinit(invlist);
8203     while (invlist_iternext(invlist, &start, &end)) {
8204         if (end == UV_MAX) {
8205             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8206         }
8207         else if (end != start) {
8208             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8209                                                  start,         end);
8210         }
8211         else {
8212             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8213         }
8214     }
8215 }
8216 #endif
8217
8218 #if 0
8219 bool
8220 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8221 {
8222     /* Return a boolean as to if the two passed in inversion lists are
8223      * identical.  The final argument, if TRUE, says to take the complement of
8224      * the second inversion list before doing the comparison */
8225
8226     UV* array_a = invlist_array(a);
8227     UV* array_b = invlist_array(b);
8228     UV len_a = _invlist_len(a);
8229     UV len_b = _invlist_len(b);
8230
8231     UV i = 0;               /* current index into the arrays */
8232     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8233
8234     PERL_ARGS_ASSERT__INVLISTEQ;
8235
8236     /* If are to compare 'a' with the complement of b, set it
8237      * up so are looking at b's complement. */
8238     if (complement_b) {
8239
8240         /* The complement of nothing is everything, so <a> would have to have
8241          * just one element, starting at zero (ending at infinity) */
8242         if (len_b == 0) {
8243             return (len_a == 1 && array_a[0] == 0);
8244         }
8245         else if (array_b[0] == 0) {
8246
8247             /* Otherwise, to complement, we invert.  Here, the first element is
8248              * 0, just remove it.  To do this, we just pretend the array starts
8249              * one later, and clear the flag as we don't have to do anything
8250              * else later */
8251
8252             array_b++;
8253             len_b--;
8254             complement_b = FALSE;
8255         }
8256         else {
8257
8258             /* But if the first element is not zero, we unshift a 0 before the
8259              * array.  The data structure reserves a space for that 0 (which
8260              * should be a '1' right now), so physical shifting is unneeded,
8261              * but temporarily change that element to 0.  Before exiting the
8262              * routine, we must restore the element to '1' */
8263             array_b--;
8264             len_b++;
8265             array_b[0] = 0;
8266         }
8267     }
8268
8269     /* Make sure that the lengths are the same, as well as the final element
8270      * before looping through the remainder.  (Thus we test the length, final,
8271      * and first elements right off the bat) */
8272     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8273         retval = FALSE;
8274     }
8275     else for (i = 0; i < len_a - 1; i++) {
8276         if (array_a[i] != array_b[i]) {
8277             retval = FALSE;
8278             break;
8279         }
8280     }
8281
8282     if (complement_b) {
8283         array_b[0] = 1;
8284     }
8285     return retval;
8286 }
8287 #endif
8288
8289 #undef HEADER_LENGTH
8290 #undef INVLIST_INITIAL_LENGTH
8291 #undef TO_INTERNAL_SIZE
8292 #undef FROM_INTERNAL_SIZE
8293 #undef INVLIST_LEN_OFFSET
8294 #undef INVLIST_ZERO_OFFSET
8295 #undef INVLIST_ITER_OFFSET
8296 #undef INVLIST_VERSION_ID
8297 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8298
8299 /* End of inversion list object */
8300
8301 STATIC void
8302 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8303 {
8304     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8305      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8306      * should point to the first flag; it is updated on output to point to the
8307      * final ')' or ':'.  There needs to be at least one flag, or this will
8308      * abort */
8309
8310     /* for (?g), (?gc), and (?o) warnings; warning
8311        about (?c) will warn about (?g) -- japhy    */
8312
8313 #define WASTED_O  0x01
8314 #define WASTED_G  0x02
8315 #define WASTED_C  0x04
8316 #define WASTED_GC (0x02|0x04)
8317     I32 wastedflags = 0x00;
8318     U32 posflags = 0, negflags = 0;
8319     U32 *flagsp = &posflags;
8320     char has_charset_modifier = '\0';
8321     regex_charset cs;
8322     bool has_use_defaults = FALSE;
8323     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8324
8325     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8326
8327     /* '^' as an initial flag sets certain defaults */
8328     if (UCHARAT(RExC_parse) == '^') {
8329         RExC_parse++;
8330         has_use_defaults = TRUE;
8331         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8332         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8333                                         ? REGEX_UNICODE_CHARSET
8334                                         : REGEX_DEPENDS_CHARSET);
8335     }
8336
8337     cs = get_regex_charset(RExC_flags);
8338     if (cs == REGEX_DEPENDS_CHARSET
8339         && (RExC_utf8 || RExC_uni_semantics))
8340     {
8341         cs = REGEX_UNICODE_CHARSET;
8342     }
8343
8344     while (*RExC_parse) {
8345         /* && strchr("iogcmsx", *RExC_parse) */
8346         /* (?g), (?gc) and (?o) are useless here
8347            and must be globally applied -- japhy */
8348         switch (*RExC_parse) {
8349
8350             /* Code for the imsx flags */
8351             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8352
8353             case LOCALE_PAT_MOD:
8354                 if (has_charset_modifier) {
8355                     goto excess_modifier;
8356                 }
8357                 else if (flagsp == &negflags) {
8358                     goto neg_modifier;
8359                 }
8360                 cs = REGEX_LOCALE_CHARSET;
8361                 has_charset_modifier = LOCALE_PAT_MOD;
8362                 RExC_contains_locale = 1;
8363                 break;
8364             case UNICODE_PAT_MOD:
8365                 if (has_charset_modifier) {
8366                     goto excess_modifier;
8367                 }
8368                 else if (flagsp == &negflags) {
8369                     goto neg_modifier;
8370                 }
8371                 cs = REGEX_UNICODE_CHARSET;
8372                 has_charset_modifier = UNICODE_PAT_MOD;
8373                 break;
8374             case ASCII_RESTRICT_PAT_MOD:
8375                 if (flagsp == &negflags) {
8376                     goto neg_modifier;
8377                 }
8378                 if (has_charset_modifier) {
8379                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8380                         goto excess_modifier;
8381                     }
8382                     /* Doubled modifier implies more restricted */
8383                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8384                 }
8385                 else {
8386                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8387                 }
8388                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8389                 break;
8390             case DEPENDS_PAT_MOD:
8391                 if (has_use_defaults) {
8392                     goto fail_modifiers;
8393                 }
8394                 else if (flagsp == &negflags) {
8395                     goto neg_modifier;
8396                 }
8397                 else if (has_charset_modifier) {
8398                     goto excess_modifier;
8399                 }
8400
8401                 /* The dual charset means unicode semantics if the
8402                  * pattern (or target, not known until runtime) are
8403                  * utf8, or something in the pattern indicates unicode
8404                  * semantics */
8405                 cs = (RExC_utf8 || RExC_uni_semantics)
8406                      ? REGEX_UNICODE_CHARSET
8407                      : REGEX_DEPENDS_CHARSET;
8408                 has_charset_modifier = DEPENDS_PAT_MOD;
8409                 break;
8410             excess_modifier:
8411                 RExC_parse++;
8412                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8413                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8414                 }
8415                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8416                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8417                 }
8418                 else {
8419                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8420                 }
8421                 /*NOTREACHED*/
8422             neg_modifier:
8423                 RExC_parse++;
8424                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8425                 /*NOTREACHED*/
8426             case ONCE_PAT_MOD: /* 'o' */
8427             case GLOBAL_PAT_MOD: /* 'g' */
8428                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8429                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8430                     if (! (wastedflags & wflagbit) ) {
8431                         wastedflags |= wflagbit;
8432                         vWARN5(
8433                             RExC_parse + 1,
8434                             "Useless (%s%c) - %suse /%c modifier",
8435                             flagsp == &negflags ? "?-" : "?",
8436                             *RExC_parse,
8437                             flagsp == &negflags ? "don't " : "",
8438                             *RExC_parse
8439                         );
8440                     }
8441                 }
8442                 break;
8443
8444             case CONTINUE_PAT_MOD: /* 'c' */
8445                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8446                     if (! (wastedflags & WASTED_C) ) {
8447                         wastedflags |= WASTED_GC;
8448                         vWARN3(
8449                             RExC_parse + 1,
8450                             "Useless (%sc) - %suse /gc modifier",
8451                             flagsp == &negflags ? "?-" : "?",
8452                             flagsp == &negflags ? "don't " : ""
8453                         );
8454                     }
8455                 }
8456                 break;
8457             case KEEPCOPY_PAT_MOD: /* 'p' */
8458                 if (flagsp == &negflags) {
8459                     if (SIZE_ONLY)
8460                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8461                 } else {
8462                     *flagsp |= RXf_PMf_KEEPCOPY;
8463                 }
8464                 break;
8465             case '-':
8466                 /* A flag is a default iff it is following a minus, so
8467                  * if there is a minus, it means will be trying to
8468                  * re-specify a default which is an error */
8469                 if (has_use_defaults || flagsp == &negflags) {
8470                     goto fail_modifiers;
8471                 }
8472                 flagsp = &negflags;
8473                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8474                 break;
8475             case ':':
8476             case ')':
8477                 RExC_flags |= posflags;
8478                 RExC_flags &= ~negflags;
8479                 set_regex_charset(&RExC_flags, cs);
8480                 return;
8481                 /*NOTREACHED*/
8482             default:
8483             fail_modifiers:
8484                 RExC_parse++;
8485                 vFAIL3("Sequence (%.*s...) not recognized",
8486                        RExC_parse-seqstart, seqstart);
8487                 /*NOTREACHED*/
8488         }
8489
8490         ++RExC_parse;
8491     }
8492 }
8493
8494 /*
8495  - reg - regular expression, i.e. main body or parenthesized thing
8496  *
8497  * Caller must absorb opening parenthesis.
8498  *
8499  * Combining parenthesis handling with the base level of regular expression
8500  * is a trifle forced, but the need to tie the tails of the branches to what
8501  * follows makes it hard to avoid.
8502  */
8503 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8504 #ifdef DEBUGGING
8505 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8506 #else
8507 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8508 #endif
8509
8510 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8511    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8512    needs to be restarted.
8513    Otherwise would only return NULL if regbranch() returns NULL, which
8514    cannot happen.  */
8515 STATIC regnode *
8516 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8517     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8518 {
8519     dVAR;
8520     regnode *ret;               /* Will be the head of the group. */
8521     regnode *br;
8522     regnode *lastbr;
8523     regnode *ender = NULL;
8524     I32 parno = 0;
8525     I32 flags;
8526     U32 oregflags = RExC_flags;
8527     bool have_branch = 0;
8528     bool is_open = 0;
8529     I32 freeze_paren = 0;
8530     I32 after_freeze = 0;
8531
8532     char * parse_start = RExC_parse; /* MJD */
8533     char * const oregcomp_parse = RExC_parse;
8534
8535     GET_RE_DEBUG_FLAGS_DECL;
8536
8537     PERL_ARGS_ASSERT_REG;
8538     DEBUG_PARSE("reg ");
8539
8540     *flagp = 0;                         /* Tentatively. */
8541
8542
8543     /* Make an OPEN node, if parenthesized. */
8544     if (paren) {
8545         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8546             char *start_verb = RExC_parse;
8547             STRLEN verb_len = 0;
8548             char *start_arg = NULL;
8549             unsigned char op = 0;
8550             int argok = 1;
8551             int internal_argval = 0; /* internal_argval is only useful if !argok */
8552             while ( *RExC_parse && *RExC_parse != ')' ) {
8553                 if ( *RExC_parse == ':' ) {
8554                     start_arg = RExC_parse + 1;
8555                     break;
8556                 }
8557                 RExC_parse++;
8558             }
8559             ++start_verb;
8560             verb_len = RExC_parse - start_verb;
8561             if ( start_arg ) {
8562                 RExC_parse++;
8563                 while ( *RExC_parse && *RExC_parse != ')' )
8564                     RExC_parse++;
8565                 if ( *RExC_parse != ')' )
8566                     vFAIL("Unterminated verb pattern argument");
8567                 if ( RExC_parse == start_arg )
8568                     start_arg = NULL;
8569             } else {
8570                 if ( *RExC_parse != ')' )
8571                     vFAIL("Unterminated verb pattern");
8572             }
8573
8574             switch ( *start_verb ) {
8575             case 'A':  /* (*ACCEPT) */
8576                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8577                     op = ACCEPT;
8578                     internal_argval = RExC_nestroot;
8579                 }
8580                 break;
8581             case 'C':  /* (*COMMIT) */
8582                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8583                     op = COMMIT;
8584                 break;
8585             case 'F':  /* (*FAIL) */
8586                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8587                     op = OPFAIL;
8588                     argok = 0;
8589                 }
8590                 break;
8591             case ':':  /* (*:NAME) */
8592             case 'M':  /* (*MARK:NAME) */
8593                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8594                     op = MARKPOINT;
8595                     argok = -1;
8596                 }
8597                 break;
8598             case 'P':  /* (*PRUNE) */
8599                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8600                     op = PRUNE;
8601                 break;
8602             case 'S':   /* (*SKIP) */
8603                 if ( memEQs(start_verb,verb_len,"SKIP") )
8604                     op = SKIP;
8605                 break;
8606             case 'T':  /* (*THEN) */
8607                 /* [19:06] <TimToady> :: is then */
8608                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8609                     op = CUTGROUP;
8610                     RExC_seen |= REG_SEEN_CUTGROUP;
8611                 }
8612                 break;
8613             }
8614             if ( ! op ) {
8615                 RExC_parse++;
8616                 vFAIL3("Unknown verb pattern '%.*s'",
8617                     verb_len, start_verb);
8618             }
8619             if ( argok ) {
8620                 if ( start_arg && internal_argval ) {
8621                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8622                         verb_len, start_verb);
8623                 } else if ( argok < 0 && !start_arg ) {
8624                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8625                         verb_len, start_verb);
8626                 } else {
8627                     ret = reganode(pRExC_state, op, internal_argval);
8628                     if ( ! internal_argval && ! SIZE_ONLY ) {
8629                         if (start_arg) {
8630                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8631                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8632                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8633                             ret->flags = 0;
8634                         } else {
8635                             ret->flags = 1;
8636                         }
8637                     }
8638                 }
8639                 if (!internal_argval)
8640                     RExC_seen |= REG_SEEN_VERBARG;
8641             } else if ( start_arg ) {
8642                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8643                         verb_len, start_verb);
8644             } else {
8645                 ret = reg_node(pRExC_state, op);
8646             }
8647             nextchar(pRExC_state);
8648             return ret;
8649         } else
8650         if (*RExC_parse == '?') { /* (?...) */
8651             bool is_logical = 0;
8652             const char * const seqstart = RExC_parse;
8653
8654             RExC_parse++;
8655             paren = *RExC_parse++;
8656             ret = NULL;                 /* For look-ahead/behind. */
8657             switch (paren) {
8658
8659             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8660                 paren = *RExC_parse++;
8661                 if ( paren == '<')         /* (?P<...>) named capture */
8662                     goto named_capture;
8663                 else if (paren == '>') {   /* (?P>name) named recursion */
8664                     goto named_recursion;
8665                 }
8666                 else if (paren == '=') {   /* (?P=...)  named backref */
8667                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8668                        you change this make sure you change that */
8669                     char* name_start = RExC_parse;
8670                     U32 num = 0;
8671                     SV *sv_dat = reg_scan_name(pRExC_state,
8672                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8673                     if (RExC_parse == name_start || *RExC_parse != ')')
8674                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8675
8676                     if (!SIZE_ONLY) {
8677                         num = add_data( pRExC_state, 1, "S" );
8678                         RExC_rxi->data->data[num]=(void*)sv_dat;
8679                         SvREFCNT_inc_simple_void(sv_dat);
8680                     }
8681                     RExC_sawback = 1;
8682                     ret = reganode(pRExC_state,
8683                                    ((! FOLD)
8684                                      ? NREF
8685                                      : (ASCII_FOLD_RESTRICTED)
8686                                        ? NREFFA
8687                                        : (AT_LEAST_UNI_SEMANTICS)
8688                                          ? NREFFU
8689                                          : (LOC)
8690                                            ? NREFFL
8691                                            : NREFF),
8692                                     num);
8693                     *flagp |= HASWIDTH;
8694
8695                     Set_Node_Offset(ret, parse_start+1);
8696                     Set_Node_Cur_Length(ret); /* MJD */
8697
8698                     nextchar(pRExC_state);
8699                     return ret;
8700                 }
8701                 RExC_parse++;
8702                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8703                 /*NOTREACHED*/
8704             case '<':           /* (?<...) */
8705                 if (*RExC_parse == '!')
8706                     paren = ',';
8707                 else if (*RExC_parse != '=')
8708               named_capture:
8709                 {               /* (?<...>) */
8710                     char *name_start;
8711                     SV *svname;
8712                     paren= '>';
8713             case '\'':          /* (?'...') */
8714                     name_start= RExC_parse;
8715                     svname = reg_scan_name(pRExC_state,
8716                         SIZE_ONLY ?  /* reverse test from the others */
8717                         REG_RSN_RETURN_NAME :
8718                         REG_RSN_RETURN_NULL);
8719                     if (RExC_parse == name_start) {
8720                         RExC_parse++;
8721                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8722                         /*NOTREACHED*/
8723                     }
8724                     if (*RExC_parse != paren)
8725                         vFAIL2("Sequence (?%c... not terminated",
8726                             paren=='>' ? '<' : paren);
8727                     if (SIZE_ONLY) {
8728                         HE *he_str;
8729                         SV *sv_dat = NULL;
8730                         if (!svname) /* shouldn't happen */
8731                             Perl_croak(aTHX_
8732                                 "panic: reg_scan_name returned NULL");
8733                         if (!RExC_paren_names) {
8734                             RExC_paren_names= newHV();
8735                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8736 #ifdef DEBUGGING
8737                             RExC_paren_name_list= newAV();
8738                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8739 #endif
8740                         }
8741                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8742                         if ( he_str )
8743                             sv_dat = HeVAL(he_str);
8744                         if ( ! sv_dat ) {
8745                             /* croak baby croak */
8746                             Perl_croak(aTHX_
8747                                 "panic: paren_name hash element allocation failed");
8748                         } else if ( SvPOK(sv_dat) ) {
8749                             /* (?|...) can mean we have dupes so scan to check
8750                                its already been stored. Maybe a flag indicating
8751                                we are inside such a construct would be useful,
8752                                but the arrays are likely to be quite small, so
8753                                for now we punt -- dmq */
8754                             IV count = SvIV(sv_dat);
8755                             I32 *pv = (I32*)SvPVX(sv_dat);
8756                             IV i;
8757                             for ( i = 0 ; i < count ; i++ ) {
8758                                 if ( pv[i] == RExC_npar ) {
8759                                     count = 0;
8760                                     break;
8761                                 }
8762                             }
8763                             if ( count ) {
8764                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8765                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8766                                 pv[count] = RExC_npar;
8767                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8768                             }
8769                         } else {
8770                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8771                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8772                             SvIOK_on(sv_dat);
8773                             SvIV_set(sv_dat, 1);
8774                         }
8775 #ifdef DEBUGGING
8776                         /* Yes this does cause a memory leak in debugging Perls */
8777                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8778                             SvREFCNT_dec_NN(svname);
8779 #endif
8780
8781                         /*sv_dump(sv_dat);*/
8782                     }
8783                     nextchar(pRExC_state);
8784                     paren = 1;
8785                     goto capturing_parens;
8786                 }
8787                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8788                 RExC_in_lookbehind++;
8789                 RExC_parse++;
8790             case '=':           /* (?=...) */
8791                 RExC_seen_zerolen++;
8792                 break;
8793             case '!':           /* (?!...) */
8794                 RExC_seen_zerolen++;
8795                 if (*RExC_parse == ')') {
8796                     ret=reg_node(pRExC_state, OPFAIL);
8797                     nextchar(pRExC_state);
8798                     return ret;
8799                 }
8800                 break;
8801             case '|':           /* (?|...) */
8802                 /* branch reset, behave like a (?:...) except that
8803                    buffers in alternations share the same numbers */
8804                 paren = ':';
8805                 after_freeze = freeze_paren = RExC_npar;
8806                 break;
8807             case ':':           /* (?:...) */
8808             case '>':           /* (?>...) */
8809                 break;
8810             case '$':           /* (?$...) */
8811             case '@':           /* (?@...) */
8812                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8813                 break;
8814             case '0' :           /* (?0) */
8815             case 'R' :           /* (?R) */
8816                 if (*RExC_parse != ')')
8817                     FAIL("Sequence (?R) not terminated");
8818                 ret = reg_node(pRExC_state, GOSTART);
8819                 *flagp |= POSTPONED;
8820                 nextchar(pRExC_state);
8821                 return ret;
8822                 /*notreached*/
8823             { /* named and numeric backreferences */
8824                 I32 num;
8825             case '&':            /* (?&NAME) */
8826                 parse_start = RExC_parse - 1;
8827               named_recursion:
8828                 {
8829                     SV *sv_dat = reg_scan_name(pRExC_state,
8830                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8831                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8832                 }
8833                 goto gen_recurse_regop;
8834                 assert(0); /* NOT REACHED */
8835             case '+':
8836                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8837                     RExC_parse++;
8838                     vFAIL("Illegal pattern");
8839                 }
8840                 goto parse_recursion;
8841                 /* NOT REACHED*/
8842             case '-': /* (?-1) */
8843                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8844                     RExC_parse--; /* rewind to let it be handled later */
8845                     goto parse_flags;
8846                 }
8847                 /*FALLTHROUGH */
8848             case '1': case '2': case '3': case '4': /* (?1) */
8849             case '5': case '6': case '7': case '8': case '9':
8850                 RExC_parse--;
8851               parse_recursion:
8852                 num = atoi(RExC_parse);
8853                 parse_start = RExC_parse - 1; /* MJD */
8854                 if (*RExC_parse == '-')
8855                     RExC_parse++;
8856                 while (isDIGIT(*RExC_parse))
8857                         RExC_parse++;
8858                 if (*RExC_parse!=')')
8859                     vFAIL("Expecting close bracket");
8860
8861               gen_recurse_regop:
8862                 if ( paren == '-' ) {
8863                     /*
8864                     Diagram of capture buffer numbering.
8865                     Top line is the normal capture buffer numbers
8866                     Bottom line is the negative indexing as from
8867                     the X (the (?-2))
8868
8869                     +   1 2    3 4 5 X          6 7
8870                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8871                     -   5 4    3 2 1 X          x x
8872
8873                     */
8874                     num = RExC_npar + num;
8875                     if (num < 1)  {
8876                         RExC_parse++;
8877                         vFAIL("Reference to nonexistent group");
8878                     }
8879                 } else if ( paren == '+' ) {
8880                     num = RExC_npar + num - 1;
8881                 }
8882
8883                 ret = reganode(pRExC_state, GOSUB, num);
8884                 if (!SIZE_ONLY) {
8885                     if (num > (I32)RExC_rx->nparens) {
8886                         RExC_parse++;
8887                         vFAIL("Reference to nonexistent group");
8888                     }
8889                     ARG2L_SET( ret, RExC_recurse_count++);
8890                     RExC_emit++;
8891                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8892                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8893                 } else {
8894                     RExC_size++;
8895                 }
8896                 RExC_seen |= REG_SEEN_RECURSE;
8897                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8898                 Set_Node_Offset(ret, parse_start); /* MJD */
8899
8900                 *flagp |= POSTPONED;
8901                 nextchar(pRExC_state);
8902                 return ret;
8903             } /* named and numeric backreferences */
8904             assert(0); /* NOT REACHED */
8905
8906             case '?':           /* (??...) */
8907                 is_logical = 1;
8908                 if (*RExC_parse != '{') {
8909                     RExC_parse++;
8910                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8911                     /*NOTREACHED*/
8912                 }
8913                 *flagp |= POSTPONED;
8914                 paren = *RExC_parse++;
8915                 /* FALL THROUGH */
8916             case '{':           /* (?{...}) */
8917             {
8918                 U32 n = 0;
8919                 struct reg_code_block *cb;
8920
8921                 RExC_seen_zerolen++;
8922
8923                 if (   !pRExC_state->num_code_blocks
8924                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8925                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8926                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8927                             - RExC_start)
8928                 ) {
8929                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8930                         FAIL("panic: Sequence (?{...}): no code block found\n");
8931                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8932                 }
8933                 /* this is a pre-compiled code block (?{...}) */
8934                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8935                 RExC_parse = RExC_start + cb->end;
8936                 if (!SIZE_ONLY) {
8937                     OP *o = cb->block;
8938                     if (cb->src_regex) {
8939                         n = add_data(pRExC_state, 2, "rl");
8940                         RExC_rxi->data->data[n] =
8941                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8942                         RExC_rxi->data->data[n+1] = (void*)o;
8943                     }
8944                     else {
8945                         n = add_data(pRExC_state, 1,
8946                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8947                         RExC_rxi->data->data[n] = (void*)o;
8948                     }
8949                 }
8950                 pRExC_state->code_index++;
8951                 nextchar(pRExC_state);
8952
8953                 if (is_logical) {
8954                     regnode *eval;
8955                     ret = reg_node(pRExC_state, LOGICAL);
8956                     eval = reganode(pRExC_state, EVAL, n);
8957                     if (!SIZE_ONLY) {
8958                         ret->flags = 2;
8959                         /* for later propagation into (??{}) return value */
8960                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8961                     }
8962                     REGTAIL(pRExC_state, ret, eval);
8963                     /* deal with the length of this later - MJD */
8964                     return ret;
8965                 }
8966                 ret = reganode(pRExC_state, EVAL, n);
8967                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8968                 Set_Node_Offset(ret, parse_start);
8969                 return ret;
8970             }
8971             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8972             {
8973                 int is_define= 0;
8974                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8975                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8976                         || RExC_parse[1] == '<'
8977                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8978                         I32 flag;
8979                         regnode *tail;
8980
8981                         ret = reg_node(pRExC_state, LOGICAL);
8982                         if (!SIZE_ONLY)
8983                             ret->flags = 1;
8984
8985                         tail = reg(pRExC_state, 1, &flag, depth+1);
8986                         if (flag & RESTART_UTF8) {
8987                             *flagp = RESTART_UTF8;
8988                             return NULL;
8989                         }
8990                         REGTAIL(pRExC_state, ret, tail);
8991                         goto insert_if;
8992                     }
8993                 }
8994                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8995                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8996                 {
8997                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8998                     char *name_start= RExC_parse++;
8999                     U32 num = 0;
9000                     SV *sv_dat=reg_scan_name(pRExC_state,
9001                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9002                     if (RExC_parse == name_start || *RExC_parse != ch)
9003                         vFAIL2("Sequence (?(%c... not terminated",
9004                             (ch == '>' ? '<' : ch));
9005                     RExC_parse++;
9006                     if (!SIZE_ONLY) {
9007                         num = add_data( pRExC_state, 1, "S" );
9008                         RExC_rxi->data->data[num]=(void*)sv_dat;
9009                         SvREFCNT_inc_simple_void(sv_dat);
9010                     }
9011                     ret = reganode(pRExC_state,NGROUPP,num);
9012                     goto insert_if_check_paren;
9013                 }
9014                 else if (RExC_parse[0] == 'D' &&
9015                          RExC_parse[1] == 'E' &&
9016                          RExC_parse[2] == 'F' &&
9017                          RExC_parse[3] == 'I' &&
9018                          RExC_parse[4] == 'N' &&
9019                          RExC_parse[5] == 'E')
9020                 {
9021                     ret = reganode(pRExC_state,DEFINEP,0);
9022                     RExC_parse +=6 ;
9023                     is_define = 1;
9024                     goto insert_if_check_paren;
9025                 }
9026                 else if (RExC_parse[0] == 'R') {
9027                     RExC_parse++;
9028                     parno = 0;
9029                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9030                         parno = atoi(RExC_parse++);
9031                         while (isDIGIT(*RExC_parse))
9032                             RExC_parse++;
9033                     } else if (RExC_parse[0] == '&') {
9034                         SV *sv_dat;
9035                         RExC_parse++;
9036                         sv_dat = reg_scan_name(pRExC_state,
9037                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9038                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9039                     }
9040                     ret = reganode(pRExC_state,INSUBP,parno);
9041                     goto insert_if_check_paren;
9042                 }
9043                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9044                     /* (?(1)...) */
9045                     char c;
9046                     parno = atoi(RExC_parse++);
9047
9048                     while (isDIGIT(*RExC_parse))
9049                         RExC_parse++;
9050                     ret = reganode(pRExC_state, GROUPP, parno);
9051
9052                  insert_if_check_paren:
9053                     if ((c = *nextchar(pRExC_state)) != ')')
9054                         vFAIL("Switch condition not recognized");
9055                   insert_if:
9056                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9057                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9058                     if (br == NULL) {
9059                         if (flags & RESTART_UTF8) {
9060                             *flagp = RESTART_UTF8;
9061                             return NULL;
9062                         }
9063                         FAIL2("panic: regbranch returned NULL, flags=%#X",
9064                               flags);
9065                     } else
9066                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9067                     c = *nextchar(pRExC_state);
9068                     if (flags&HASWIDTH)
9069                         *flagp |= HASWIDTH;
9070                     if (c == '|') {
9071                         if (is_define)
9072                             vFAIL("(?(DEFINE)....) does not allow branches");
9073                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9074                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9075                             if (flags & RESTART_UTF8) {
9076                                 *flagp = RESTART_UTF8;
9077                                 return NULL;
9078                             }
9079                             FAIL2("panic: regbranch returned NULL, flags=%#X",
9080                                   flags);
9081                         }
9082                         REGTAIL(pRExC_state, ret, lastbr);
9083                         if (flags&HASWIDTH)
9084                             *flagp |= HASWIDTH;
9085                         c = *nextchar(pRExC_state);
9086                     }
9087                     else
9088                         lastbr = NULL;
9089                     if (c != ')')
9090                         vFAIL("Switch (?(condition)... contains too many branches");
9091                     ender = reg_node(pRExC_state, TAIL);
9092                     REGTAIL(pRExC_state, br, ender);
9093                     if (lastbr) {
9094                         REGTAIL(pRExC_state, lastbr, ender);
9095                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9096                     }
9097                     else
9098                         REGTAIL(pRExC_state, ret, ender);
9099                     RExC_size++; /* XXX WHY do we need this?!!
9100                                     For large programs it seems to be required
9101                                     but I can't figure out why. -- dmq*/
9102                     return ret;
9103                 }
9104                 else {
9105                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9106                 }
9107             }
9108             case '[':           /* (?[ ... ]) */
9109                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9110                                          oregcomp_parse);
9111             case 0:
9112                 RExC_parse--; /* for vFAIL to print correctly */
9113                 vFAIL("Sequence (? incomplete");
9114                 break;
9115             default: /* e.g., (?i) */
9116                 --RExC_parse;
9117               parse_flags:
9118                 parse_lparen_question_flags(pRExC_state);
9119                 if (UCHARAT(RExC_parse) != ':') {
9120                     nextchar(pRExC_state);
9121                     *flagp = TRYAGAIN;
9122                     return NULL;
9123                 }
9124                 paren = ':';
9125                 nextchar(pRExC_state);
9126                 ret = NULL;
9127                 goto parse_rest;
9128             } /* end switch */
9129         }
9130         else {                  /* (...) */
9131           capturing_parens:
9132             parno = RExC_npar;
9133             RExC_npar++;
9134
9135             ret = reganode(pRExC_state, OPEN, parno);
9136             if (!SIZE_ONLY ){
9137                 if (!RExC_nestroot)
9138                     RExC_nestroot = parno;
9139                 if (RExC_seen & REG_SEEN_RECURSE
9140                     && !RExC_open_parens[parno-1])
9141                 {
9142                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9143                         "Setting open paren #%"IVdf" to %d\n",
9144                         (IV)parno, REG_NODE_NUM(ret)));
9145                     RExC_open_parens[parno-1]= ret;
9146                 }
9147             }
9148             Set_Node_Length(ret, 1); /* MJD */
9149             Set_Node_Offset(ret, RExC_parse); /* MJD */
9150             is_open = 1;
9151         }
9152     }
9153     else                        /* ! paren */
9154         ret = NULL;
9155
9156    parse_rest:
9157     /* Pick up the branches, linking them together. */
9158     parse_start = RExC_parse;   /* MJD */
9159     br = regbranch(pRExC_state, &flags, 1,depth+1);
9160
9161     /*     branch_len = (paren != 0); */
9162
9163     if (br == NULL) {
9164         if (flags & RESTART_UTF8) {
9165             *flagp = RESTART_UTF8;
9166             return NULL;
9167         }
9168         FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9169     }
9170     if (*RExC_parse == '|') {
9171         if (!SIZE_ONLY && RExC_extralen) {
9172             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9173         }
9174         else {                  /* MJD */
9175             reginsert(pRExC_state, BRANCH, br, depth+1);
9176             Set_Node_Length(br, paren != 0);
9177             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9178         }
9179         have_branch = 1;
9180         if (SIZE_ONLY)
9181             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9182     }
9183     else if (paren == ':') {
9184         *flagp |= flags&SIMPLE;
9185     }
9186     if (is_open) {                              /* Starts with OPEN. */
9187         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9188     }
9189     else if (paren != '?')              /* Not Conditional */
9190         ret = br;
9191     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9192     lastbr = br;
9193     while (*RExC_parse == '|') {
9194         if (!SIZE_ONLY && RExC_extralen) {
9195             ender = reganode(pRExC_state, LONGJMP,0);
9196             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9197         }
9198         if (SIZE_ONLY)
9199             RExC_extralen += 2;         /* Account for LONGJMP. */
9200         nextchar(pRExC_state);
9201         if (freeze_paren) {
9202             if (RExC_npar > after_freeze)
9203                 after_freeze = RExC_npar;
9204             RExC_npar = freeze_paren;
9205         }
9206         br = regbranch(pRExC_state, &flags, 0, depth+1);
9207
9208         if (br == NULL) {
9209             if (flags & RESTART_UTF8) {
9210                 *flagp = RESTART_UTF8;
9211                 return NULL;
9212             }
9213             FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9214         }
9215         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9216         lastbr = br;
9217         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9218     }
9219
9220     if (have_branch || paren != ':') {
9221         /* Make a closing node, and hook it on the end. */
9222         switch (paren) {
9223         case ':':
9224             ender = reg_node(pRExC_state, TAIL);
9225             break;
9226         case 1:
9227             ender = reganode(pRExC_state, CLOSE, parno);
9228             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9229                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9230                         "Setting close paren #%"IVdf" to %d\n",
9231                         (IV)parno, REG_NODE_NUM(ender)));
9232                 RExC_close_parens[parno-1]= ender;
9233                 if (RExC_nestroot == parno)
9234                     RExC_nestroot = 0;
9235             }
9236             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9237             Set_Node_Length(ender,1); /* MJD */
9238             break;
9239         case '<':
9240         case ',':
9241         case '=':
9242         case '!':
9243             *flagp &= ~HASWIDTH;
9244             /* FALL THROUGH */
9245         case '>':
9246             ender = reg_node(pRExC_state, SUCCEED);
9247             break;
9248         case 0:
9249             ender = reg_node(pRExC_state, END);
9250             if (!SIZE_ONLY) {
9251                 assert(!RExC_opend); /* there can only be one! */
9252                 RExC_opend = ender;
9253             }
9254             break;
9255         }
9256         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9257             SV * const mysv_val1=sv_newmortal();
9258             SV * const mysv_val2=sv_newmortal();
9259             DEBUG_PARSE_MSG("lsbr");
9260             regprop(RExC_rx, mysv_val1, lastbr);
9261             regprop(RExC_rx, mysv_val2, ender);
9262             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9263                           SvPV_nolen_const(mysv_val1),
9264                           (IV)REG_NODE_NUM(lastbr),
9265                           SvPV_nolen_const(mysv_val2),
9266                           (IV)REG_NODE_NUM(ender),
9267                           (IV)(ender - lastbr)
9268             );
9269         });
9270         REGTAIL(pRExC_state, lastbr, ender);
9271
9272         if (have_branch && !SIZE_ONLY) {
9273             char is_nothing= 1;
9274             if (depth==1)
9275                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9276
9277             /* Hook the tails of the branches to the closing node. */
9278             for (br = ret; br; br = regnext(br)) {
9279                 const U8 op = PL_regkind[OP(br)];
9280                 if (op == BRANCH) {
9281                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9282                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9283                         is_nothing= 0;
9284                 }
9285                 else if (op == BRANCHJ) {
9286                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9287                     /* for now we always disable this optimisation * /
9288                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9289                     */
9290                         is_nothing= 0;
9291                 }
9292             }
9293             if (is_nothing) {
9294                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9295                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9296                     SV * const mysv_val1=sv_newmortal();
9297                     SV * const mysv_val2=sv_newmortal();
9298                     DEBUG_PARSE_MSG("NADA");
9299                     regprop(RExC_rx, mysv_val1, ret);
9300                     regprop(RExC_rx, mysv_val2, ender);
9301                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9302                                   SvPV_nolen_const(mysv_val1),
9303                                   (IV)REG_NODE_NUM(ret),
9304                                   SvPV_nolen_const(mysv_val2),
9305                                   (IV)REG_NODE_NUM(ender),
9306                                   (IV)(ender - ret)
9307                     );
9308                 });
9309                 OP(br)= NOTHING;
9310                 if (OP(ender) == TAIL) {
9311                     NEXT_OFF(br)= 0;
9312                     RExC_emit= br + 1;
9313                 } else {
9314                     regnode *opt;
9315                     for ( opt= br + 1; opt < ender ; opt++ )
9316                         OP(opt)= OPTIMIZED;
9317                     NEXT_OFF(br)= ender - br;
9318                 }
9319             }
9320         }
9321     }
9322
9323     {
9324         const char *p;
9325         static const char parens[] = "=!<,>";
9326
9327         if (paren && (p = strchr(parens, paren))) {
9328             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9329             int flag = (p - parens) > 1;
9330
9331             if (paren == '>')
9332                 node = SUSPEND, flag = 0;
9333             reginsert(pRExC_state, node,ret, depth+1);
9334             Set_Node_Cur_Length(ret);
9335             Set_Node_Offset(ret, parse_start + 1);
9336             ret->flags = flag;
9337             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9338         }
9339     }
9340
9341     /* Check for proper termination. */
9342     if (paren) {
9343         RExC_flags = oregflags;
9344         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9345             RExC_parse = oregcomp_parse;
9346             vFAIL("Unmatched (");
9347         }
9348     }
9349     else if (!paren && RExC_parse < RExC_end) {
9350         if (*RExC_parse == ')') {
9351             RExC_parse++;
9352             vFAIL("Unmatched )");
9353         }
9354         else
9355             FAIL("Junk on end of regexp");      /* "Can't happen". */
9356         assert(0); /* NOTREACHED */
9357     }
9358
9359     if (RExC_in_lookbehind) {
9360         RExC_in_lookbehind--;
9361     }
9362     if (after_freeze > RExC_npar)
9363         RExC_npar = after_freeze;
9364     return(ret);
9365 }
9366
9367 /*
9368  - regbranch - one alternative of an | operator
9369  *
9370  * Implements the concatenation operator.
9371  *
9372  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9373  * restarted.
9374  */
9375 STATIC regnode *
9376 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9377 {
9378     dVAR;
9379     regnode *ret;
9380     regnode *chain = NULL;
9381     regnode *latest;
9382     I32 flags = 0, c = 0;
9383     GET_RE_DEBUG_FLAGS_DECL;
9384
9385     PERL_ARGS_ASSERT_REGBRANCH;
9386
9387     DEBUG_PARSE("brnc");
9388
9389     if (first)
9390         ret = NULL;
9391     else {
9392         if (!SIZE_ONLY && RExC_extralen)
9393             ret = reganode(pRExC_state, BRANCHJ,0);
9394         else {
9395             ret = reg_node(pRExC_state, BRANCH);
9396             Set_Node_Length(ret, 1);
9397         }
9398     }
9399
9400     if (!first && SIZE_ONLY)
9401         RExC_extralen += 1;                     /* BRANCHJ */
9402
9403     *flagp = WORST;                     /* Tentatively. */
9404
9405     RExC_parse--;
9406     nextchar(pRExC_state);
9407     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9408         flags &= ~TRYAGAIN;
9409         latest = regpiece(pRExC_state, &flags,depth+1);
9410         if (latest == NULL) {
9411             if (flags & TRYAGAIN)
9412                 continue;
9413             if (flags & RESTART_UTF8) {
9414                 *flagp = RESTART_UTF8;
9415                 return NULL;
9416             }
9417             FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9418         }
9419         else if (ret == NULL)
9420             ret = latest;
9421         *flagp |= flags&(HASWIDTH|POSTPONED);
9422         if (chain == NULL)      /* First piece. */
9423             *flagp |= flags&SPSTART;
9424         else {
9425             RExC_naughty++;
9426             REGTAIL(pRExC_state, chain, latest);
9427         }
9428         chain = latest;
9429         c++;
9430     }
9431     if (chain == NULL) {        /* Loop ran zero times. */
9432         chain = reg_node(pRExC_state, NOTHING);
9433         if (ret == NULL)
9434             ret = chain;
9435     }
9436     if (c == 1) {
9437         *flagp |= flags&SIMPLE;
9438     }
9439
9440     return ret;
9441 }
9442
9443 /*
9444  - regpiece - something followed by possible [*+?]
9445  *
9446  * Note that the branching code sequences used for ? and the general cases
9447  * of * and + are somewhat optimized:  they use the same NOTHING node as
9448  * both the endmarker for their branch list and the body of the last branch.
9449  * It might seem that this node could be dispensed with entirely, but the
9450  * endmarker role is not redundant.
9451  *
9452  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9453  * TRYAGAIN.
9454  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9455  * restarted.
9456  */
9457 STATIC regnode *
9458 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9459 {
9460     dVAR;
9461     regnode *ret;
9462     char op;
9463     char *next;
9464     I32 flags;
9465     const char * const origparse = RExC_parse;
9466     I32 min;
9467     I32 max = REG_INFTY;
9468 #ifdef RE_TRACK_PATTERN_OFFSETS
9469     char *parse_start;
9470 #endif
9471     const char *maxpos = NULL;
9472
9473     /* Save the original in case we change the emitted regop to a FAIL. */
9474     regnode * const orig_emit = RExC_emit;
9475
9476     GET_RE_DEBUG_FLAGS_DECL;
9477
9478     PERL_ARGS_ASSERT_REGPIECE;
9479
9480     DEBUG_PARSE("piec");
9481
9482     ret = regatom(pRExC_state, &flags,depth+1);
9483     if (ret == NULL) {
9484         if (flags & (TRYAGAIN|RESTART_UTF8))
9485             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9486         else
9487             FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9488         return(NULL);
9489     }
9490
9491     op = *RExC_parse;
9492
9493     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9494         maxpos = NULL;
9495 #ifdef RE_TRACK_PATTERN_OFFSETS
9496         parse_start = RExC_parse; /* MJD */
9497 #endif
9498         next = RExC_parse + 1;
9499         while (isDIGIT(*next) || *next == ',') {
9500             if (*next == ',') {
9501                 if (maxpos)
9502                     break;
9503                 else
9504                     maxpos = next;
9505             }
9506             next++;
9507         }
9508         if (*next == '}') {             /* got one */
9509             if (!maxpos)
9510                 maxpos = next;
9511             RExC_parse++;
9512             min = atoi(RExC_parse);
9513             if (*maxpos == ',')
9514                 maxpos++;
9515             else
9516                 maxpos = RExC_parse;
9517             max = atoi(maxpos);
9518             if (!max && *maxpos != '0')
9519                 max = REG_INFTY;                /* meaning "infinity" */
9520             else if (max >= REG_INFTY)
9521                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9522             RExC_parse = next;
9523             nextchar(pRExC_state);
9524             if (max < min) {    /* If can't match, warn and optimize to fail
9525                                    unconditionally */
9526                 if (SIZE_ONLY) {
9527                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9528
9529                     /* We can't back off the size because we have to reserve
9530                      * enough space for all the things we are about to throw
9531                      * away, but we can shrink it by the ammount we are about
9532                      * to re-use here */
9533                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9534                 }
9535                 else {
9536                     RExC_emit = orig_emit;
9537                 }
9538                 ret = reg_node(pRExC_state, OPFAIL);
9539                 return ret;
9540             }
9541             else if (max == 0) {    /* replace {0} with a nothing node */
9542                 if (SIZE_ONLY) {
9543                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9544                 }
9545                 else {
9546                     RExC_emit = orig_emit;
9547                 }
9548                 ret = reg_node(pRExC_state, NOTHING);
9549                 return ret;
9550             }
9551
9552         do_curly:
9553             if ((flags&SIMPLE)) {
9554                 RExC_naughty += 2 + RExC_naughty / 2;
9555                 reginsert(pRExC_state, CURLY, ret, depth+1);
9556                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9557                 Set_Node_Cur_Length(ret);
9558             }
9559             else {
9560                 regnode * const w = reg_node(pRExC_state, WHILEM);
9561
9562                 w->flags = 0;
9563                 REGTAIL(pRExC_state, ret, w);
9564                 if (!SIZE_ONLY && RExC_extralen) {
9565                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9566                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9567                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9568                 }
9569                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9570                                 /* MJD hk */
9571                 Set_Node_Offset(ret, parse_start+1);
9572                 Set_Node_Length(ret,
9573                                 op == '{' ? (RExC_parse - parse_start) : 1);
9574
9575                 if (!SIZE_ONLY && RExC_extralen)
9576                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9577                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9578                 if (SIZE_ONLY)
9579                     RExC_whilem_seen++, RExC_extralen += 3;
9580                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9581             }
9582             ret->flags = 0;
9583
9584             if (min > 0)
9585                 *flagp = WORST;
9586             if (max > 0)
9587                 *flagp |= HASWIDTH;
9588             if (!SIZE_ONLY) {
9589                 ARG1_SET(ret, (U16)min);
9590                 ARG2_SET(ret, (U16)max);
9591             }
9592
9593             goto nest_check;
9594         }
9595     }
9596
9597     if (!ISMULT1(op)) {
9598         *flagp = flags;
9599         return(ret);
9600     }
9601
9602 #if 0                           /* Now runtime fix should be reliable. */
9603
9604     /* if this is reinstated, don't forget to put this back into perldiag:
9605
9606             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9607
9608            (F) The part of the regexp subject to either the * or + quantifier
9609            could match an empty string. The {#} shows in the regular
9610            expression about where the problem was discovered.
9611
9612     */
9613
9614     if (!(flags&HASWIDTH) && op != '?')
9615       vFAIL("Regexp *+ operand could be empty");
9616 #endif
9617
9618 #ifdef RE_TRACK_PATTERN_OFFSETS
9619     parse_start = RExC_parse;
9620 #endif
9621     nextchar(pRExC_state);
9622
9623     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9624
9625     if (op == '*' && (flags&SIMPLE)) {
9626         reginsert(pRExC_state, STAR, ret, depth+1);
9627         ret->flags = 0;
9628         RExC_naughty += 4;
9629     }
9630     else if (op == '*') {
9631         min = 0;
9632         goto do_curly;
9633     }
9634     else if (op == '+' && (flags&SIMPLE)) {
9635         reginsert(pRExC_state, PLUS, ret, depth+1);
9636         ret->flags = 0;
9637         RExC_naughty += 3;
9638     }
9639     else if (op == '+') {
9640         min = 1;
9641         goto do_curly;
9642     }
9643     else if (op == '?') {
9644         min = 0; max = 1;
9645         goto do_curly;
9646     }
9647   nest_check:
9648     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9649         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9650         ckWARN3reg(RExC_parse,
9651                    "%.*s matches null string many times",
9652                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9653                    origparse);
9654         (void)ReREFCNT_inc(RExC_rx_sv);
9655     }
9656
9657     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9658         nextchar(pRExC_state);
9659         reginsert(pRExC_state, MINMOD, ret, depth+1);
9660         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9661     }
9662 #ifndef REG_ALLOW_MINMOD_SUSPEND
9663     else
9664 #endif
9665     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9666         regnode *ender;
9667         nextchar(pRExC_state);
9668         ender = reg_node(pRExC_state, SUCCEED);
9669         REGTAIL(pRExC_state, ret, ender);
9670         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9671         ret->flags = 0;
9672         ender = reg_node(pRExC_state, TAIL);
9673         REGTAIL(pRExC_state, ret, ender);
9674         /*ret= ender;*/
9675     }
9676
9677     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9678         RExC_parse++;
9679         vFAIL("Nested quantifiers");
9680     }
9681
9682     return(ret);
9683 }
9684
9685 STATIC bool
9686 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9687         const bool strict   /* Apply stricter parsing rules? */
9688     )
9689 {
9690
9691  /* This is expected to be called by a parser routine that has recognized '\N'
9692    and needs to handle the rest. RExC_parse is expected to point at the first
9693    char following the N at the time of the call.  On successful return,
9694    RExC_parse has been updated to point to just after the sequence identified
9695    by this routine, and <*flagp> has been updated.
9696
9697    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9698    character class.
9699
9700    \N may begin either a named sequence, or if outside a character class, mean
9701    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9702    attempted to decide which, and in the case of a named sequence, converted it
9703    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9704    where c1... are the characters in the sequence.  For single-quoted regexes,
9705    the tokenizer passes the \N sequence through unchanged; this code will not
9706    attempt to determine this nor expand those, instead raising a syntax error.
9707    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9708    or there is no '}', it signals that this \N occurrence means to match a
9709    non-newline.
9710
9711    Only the \N{U+...} form should occur in a character class, for the same
9712    reason that '.' inside a character class means to just match a period: it
9713    just doesn't make sense.
9714
9715    The function raises an error (via vFAIL), and doesn't return for various
9716    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9717    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9718    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9719    only possible if node_p is non-NULL.
9720
9721
9722    If <valuep> is non-null, it means the caller can accept an input sequence
9723    consisting of a just a single code point; <*valuep> is set to that value
9724    if the input is such.
9725
9726    If <node_p> is non-null it signifies that the caller can accept any other
9727    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9728    is set as follows:
9729     1) \N means not-a-NL: points to a newly created REG_ANY node;
9730     2) \N{}:              points to a new NOTHING node;
9731     3) otherwise:         points to a new EXACT node containing the resolved
9732                           string.
9733    Note that FALSE is returned for single code point sequences if <valuep> is
9734    null.
9735  */
9736
9737     char * endbrace;    /* '}' following the name */
9738     char* p;
9739     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9740                            stream */
9741     bool has_multiple_chars; /* true if the input stream contains a sequence of
9742                                 more than one character */
9743
9744     GET_RE_DEBUG_FLAGS_DECL;
9745
9746     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9747
9748     GET_RE_DEBUG_FLAGS;
9749
9750     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9751
9752     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9753      * modifier.  The other meaning does not */
9754     p = (RExC_flags & RXf_PMf_EXTENDED)
9755         ? regwhite( pRExC_state, RExC_parse )
9756         : RExC_parse;
9757
9758     /* Disambiguate between \N meaning a named character versus \N meaning
9759      * [^\n].  The former is assumed when it can't be the latter. */
9760     if (*p != '{' || regcurly(p, FALSE)) {
9761         RExC_parse = p;
9762         if (! node_p) {
9763             /* no bare \N in a charclass */
9764             if (in_char_class) {
9765                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9766             }
9767             return FALSE;
9768         }
9769         nextchar(pRExC_state);
9770         *node_p = reg_node(pRExC_state, REG_ANY);
9771         *flagp |= HASWIDTH|SIMPLE;
9772         RExC_naughty++;
9773         RExC_parse--;
9774         Set_Node_Length(*node_p, 1); /* MJD */
9775         return TRUE;
9776     }
9777
9778     /* Here, we have decided it should be a named character or sequence */
9779
9780     /* The test above made sure that the next real character is a '{', but
9781      * under the /x modifier, it could be separated by space (or a comment and
9782      * \n) and this is not allowed (for consistency with \x{...} and the
9783      * tokenizer handling of \N{NAME}). */
9784     if (*RExC_parse != '{') {
9785         vFAIL("Missing braces on \\N{}");
9786     }
9787
9788     RExC_parse++;       /* Skip past the '{' */
9789
9790     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9791         || ! (endbrace == RExC_parse            /* nothing between the {} */
9792               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9793                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9794     {
9795         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9796         vFAIL("\\N{NAME} must be resolved by the lexer");
9797     }
9798
9799     if (endbrace == RExC_parse) {   /* empty: \N{} */
9800         bool ret = TRUE;
9801         if (node_p) {
9802             *node_p = reg_node(pRExC_state,NOTHING);
9803         }
9804         else if (in_char_class) {
9805             if (SIZE_ONLY && in_char_class) {
9806                 if (strict) {
9807                     RExC_parse++;   /* Position after the "}" */
9808                     vFAIL("Zero length \\N{}");
9809                 }
9810                 else {
9811                     ckWARNreg(RExC_parse,
9812                               "Ignoring zero length \\N{} in character class");
9813                 }
9814             }
9815             ret = FALSE;
9816         }
9817         else {
9818             return FALSE;
9819         }
9820         nextchar(pRExC_state);
9821         return ret;
9822     }
9823
9824     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9825     RExC_parse += 2;    /* Skip past the 'U+' */
9826
9827     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9828
9829     /* Code points are separated by dots.  If none, there is only one code
9830      * point, and is terminated by the brace */
9831     has_multiple_chars = (endchar < endbrace);
9832
9833     if (valuep && (! has_multiple_chars || in_char_class)) {
9834         /* We only pay attention to the first char of
9835         multichar strings being returned in char classes. I kinda wonder
9836         if this makes sense as it does change the behaviour
9837         from earlier versions, OTOH that behaviour was broken
9838         as well. XXX Solution is to recharacterize as
9839         [rest-of-class]|multi1|multi2... */
9840
9841         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9842         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9843             | PERL_SCAN_DISALLOW_PREFIX
9844             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9845
9846         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9847
9848         /* The tokenizer should have guaranteed validity, but it's possible to
9849          * bypass it by using single quoting, so check */
9850         if (length_of_hex == 0
9851             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9852         {
9853             RExC_parse += length_of_hex;        /* Includes all the valid */
9854             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9855                             ? UTF8SKIP(RExC_parse)
9856                             : 1;
9857             /* Guard against malformed utf8 */
9858             if (RExC_parse >= endchar) {
9859                 RExC_parse = endchar;
9860             }
9861             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9862         }
9863
9864         if (in_char_class && has_multiple_chars) {
9865             if (strict) {
9866                 RExC_parse = endbrace;
9867                 vFAIL("\\N{} in character class restricted to one character");
9868             }
9869             else {
9870                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9871             }
9872         }
9873
9874         RExC_parse = endbrace + 1;
9875     }
9876     else if (! node_p || ! has_multiple_chars) {
9877
9878         /* Here, the input is legal, but not according to the caller's
9879          * options.  We fail without advancing the parse, so that the
9880          * caller can try again */
9881         RExC_parse = p;
9882         return FALSE;
9883     }
9884     else {
9885
9886         /* What is done here is to convert this to a sub-pattern of the form
9887          * (?:\x{char1}\x{char2}...)
9888          * and then call reg recursively.  That way, it retains its atomicness,
9889          * while not having to worry about special handling that some code
9890          * points may have.  toke.c has converted the original Unicode values
9891          * to native, so that we can just pass on the hex values unchanged.  We
9892          * do have to set a flag to keep recoding from happening in the
9893          * recursion */
9894
9895         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9896         STRLEN len;
9897         char *orig_end = RExC_end;
9898         I32 flags;
9899
9900         while (RExC_parse < endbrace) {
9901
9902             /* Convert to notation the rest of the code understands */
9903             sv_catpv(substitute_parse, "\\x{");
9904             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9905             sv_catpv(substitute_parse, "}");
9906
9907             /* Point to the beginning of the next character in the sequence. */
9908             RExC_parse = endchar + 1;
9909             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9910         }
9911         sv_catpv(substitute_parse, ")");
9912
9913         RExC_parse = SvPV(substitute_parse, len);
9914
9915         /* Don't allow empty number */
9916         if (len < 8) {
9917             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9918         }
9919         RExC_end = RExC_parse + len;
9920
9921         /* The values are Unicode, and therefore not subject to recoding */
9922         RExC_override_recoding = 1;
9923
9924         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
9925             if (flags & RESTART_UTF8) {
9926                 *flagp = RESTART_UTF8;
9927                 return FALSE;
9928             }
9929             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
9930                   flags);
9931         }
9932         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9933
9934         RExC_parse = endbrace;
9935         RExC_end = orig_end;
9936         RExC_override_recoding = 0;
9937
9938         nextchar(pRExC_state);
9939     }
9940
9941     return TRUE;
9942 }
9943
9944
9945 /*
9946  * reg_recode
9947  *
9948  * It returns the code point in utf8 for the value in *encp.
9949  *    value: a code value in the source encoding
9950  *    encp:  a pointer to an Encode object
9951  *
9952  * If the result from Encode is not a single character,
9953  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9954  */
9955 STATIC UV
9956 S_reg_recode(pTHX_ const char value, SV **encp)
9957 {
9958     STRLEN numlen = 1;
9959     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9960     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9961     const STRLEN newlen = SvCUR(sv);
9962     UV uv = UNICODE_REPLACEMENT;
9963
9964     PERL_ARGS_ASSERT_REG_RECODE;
9965
9966     if (newlen)
9967         uv = SvUTF8(sv)
9968              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9969              : *(U8*)s;
9970
9971     if (!newlen || numlen != newlen) {
9972         uv = UNICODE_REPLACEMENT;
9973         *encp = NULL;
9974     }
9975     return uv;
9976 }
9977
9978 PERL_STATIC_INLINE U8
9979 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9980 {
9981     U8 op;
9982
9983     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9984
9985     if (! FOLD) {
9986         return EXACT;
9987     }
9988
9989     op = get_regex_charset(RExC_flags);
9990     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9991         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9992                  been, so there is no hole */
9993     }
9994
9995     return op + EXACTF;
9996 }
9997
9998 PERL_STATIC_INLINE void
9999 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10000 {
10001     /* This knows the details about sizing an EXACTish node, setting flags for
10002      * it (by setting <*flagp>, and potentially populating it with a single
10003      * character.
10004      *
10005      * If <len> (the length in bytes) is non-zero, this function assumes that
10006      * the node has already been populated, and just does the sizing.  In this
10007      * case <code_point> should be the final code point that has already been
10008      * placed into the node.  This value will be ignored except that under some
10009      * circumstances <*flagp> is set based on it.
10010      *
10011      * If <len> is zero, the function assumes that the node is to contain only
10012      * the single character given by <code_point> and calculates what <len>
10013      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10014      * additionally will populate the node's STRING with <code_point>, if <len>
10015      * is 0.  In both cases <*flagp> is appropriately set
10016      *
10017      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
10018      * folded (the latter only when the rules indicate it can match 'ss') */
10019
10020     bool len_passed_in = cBOOL(len != 0);
10021     U8 character[UTF8_MAXBYTES_CASE+1];
10022
10023     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10024
10025     if (! len_passed_in) {
10026         if (UTF) {
10027             if (FOLD) {
10028                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10029             }
10030             else {
10031                 uvchr_to_utf8( character, code_point);
10032                 len = UTF8SKIP(character);
10033             }
10034         }
10035         else if (! FOLD
10036                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10037                  || ASCII_FOLD_RESTRICTED
10038                  || ! AT_LEAST_UNI_SEMANTICS)
10039         {
10040             *character = (U8) code_point;
10041             len = 1;
10042         }
10043         else {
10044             *character = 's';
10045             *(character + 1) = 's';
10046             len = 2;
10047         }
10048     }
10049
10050     if (SIZE_ONLY) {
10051         RExC_size += STR_SZ(len);
10052     }
10053     else {
10054         RExC_emit += STR_SZ(len);
10055         STR_LEN(node) = len;
10056         if (! len_passed_in) {
10057             Copy((char *) character, STRING(node), len, char);
10058         }
10059     }
10060
10061     *flagp |= HASWIDTH;
10062
10063     /* A single character node is SIMPLE, except for the special-cased SHARP S
10064      * under /di. */
10065     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10066         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10067             || ! FOLD || ! DEPENDS_SEMANTICS))
10068     {
10069         *flagp |= SIMPLE;
10070     }
10071 }
10072
10073 /*
10074  - regatom - the lowest level
10075
10076    Try to identify anything special at the start of the pattern. If there
10077    is, then handle it as required. This may involve generating a single regop,
10078    such as for an assertion; or it may involve recursing, such as to
10079    handle a () structure.
10080
10081    If the string doesn't start with something special then we gobble up
10082    as much literal text as we can.
10083
10084    Once we have been able to handle whatever type of thing started the
10085    sequence, we return.
10086
10087    Note: we have to be careful with escapes, as they can be both literal
10088    and special, and in the case of \10 and friends, context determines which.
10089
10090    A summary of the code structure is:
10091
10092    switch (first_byte) {
10093         cases for each special:
10094             handle this special;
10095             break;
10096         case '\\':
10097             switch (2nd byte) {
10098                 cases for each unambiguous special:
10099                     handle this special;
10100                     break;
10101                 cases for each ambigous special/literal:
10102                     disambiguate;
10103                     if (special)  handle here
10104                     else goto defchar;
10105                 default: // unambiguously literal:
10106                     goto defchar;
10107             }
10108         default:  // is a literal char
10109             // FALL THROUGH
10110         defchar:
10111             create EXACTish node for literal;
10112             while (more input and node isn't full) {
10113                 switch (input_byte) {
10114                    cases for each special;
10115                        make sure parse pointer is set so that the next call to
10116                            regatom will see this special first
10117                        goto loopdone; // EXACTish node terminated by prev. char
10118                    default:
10119                        append char to EXACTISH node;
10120                 }
10121                 get next input byte;
10122             }
10123         loopdone:
10124    }
10125    return the generated node;
10126
10127    Specifically there are two separate switches for handling
10128    escape sequences, with the one for handling literal escapes requiring
10129    a dummy entry for all of the special escapes that are actually handled
10130    by the other.
10131
10132    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10133    TRYAGAIN.
10134    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10135    restarted.
10136    Otherwise does not return NULL.
10137 */
10138
10139 STATIC regnode *
10140 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10141 {
10142     dVAR;
10143     regnode *ret = NULL;
10144     I32 flags = 0;
10145     char *parse_start = RExC_parse;
10146     U8 op;
10147     int invert = 0;
10148
10149     GET_RE_DEBUG_FLAGS_DECL;
10150
10151     *flagp = WORST;             /* Tentatively. */
10152
10153     DEBUG_PARSE("atom");
10154
10155     PERL_ARGS_ASSERT_REGATOM;
10156
10157 tryagain:
10158     switch ((U8)*RExC_parse) {
10159     case '^':
10160         RExC_seen_zerolen++;
10161         nextchar(pRExC_state);
10162         if (RExC_flags & RXf_PMf_MULTILINE)
10163             ret = reg_node(pRExC_state, MBOL);
10164         else if (RExC_flags & RXf_PMf_SINGLELINE)
10165             ret = reg_node(pRExC_state, SBOL);
10166         else
10167             ret = reg_node(pRExC_state, BOL);
10168         Set_Node_Length(ret, 1); /* MJD */
10169         break;
10170     case '$':
10171         nextchar(pRExC_state);
10172         if (*RExC_parse)
10173             RExC_seen_zerolen++;
10174         if (RExC_flags & RXf_PMf_MULTILINE)
10175             ret = reg_node(pRExC_state, MEOL);
10176         else if (RExC_flags & RXf_PMf_SINGLELINE)
10177             ret = reg_node(pRExC_state, SEOL);
10178         else
10179             ret = reg_node(pRExC_state, EOL);
10180         Set_Node_Length(ret, 1); /* MJD */
10181         break;
10182     case '.':
10183         nextchar(pRExC_state);
10184         if (RExC_flags & RXf_PMf_SINGLELINE)
10185             ret = reg_node(pRExC_state, SANY);
10186         else
10187             ret = reg_node(pRExC_state, REG_ANY);
10188         *flagp |= HASWIDTH|SIMPLE;
10189         RExC_naughty++;
10190         Set_Node_Length(ret, 1); /* MJD */
10191         break;
10192     case '[':
10193     {
10194         char * const oregcomp_parse = ++RExC_parse;
10195         ret = regclass(pRExC_state, flagp,depth+1,
10196                        FALSE, /* means parse the whole char class */
10197                        TRUE, /* allow multi-char folds */
10198                        FALSE, /* don't silence non-portable warnings. */
10199                        NULL);
10200         if (*RExC_parse != ']') {
10201             RExC_parse = oregcomp_parse;
10202             vFAIL("Unmatched [");
10203         }
10204         if (ret == NULL) {
10205             if (*flagp & RESTART_UTF8)
10206                 return NULL;
10207             FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10208                   *flagp);
10209         }
10210         nextchar(pRExC_state);
10211         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10212         break;
10213     }
10214     case '(':
10215         nextchar(pRExC_state);
10216         ret = reg(pRExC_state, 1, &flags,depth+1);
10217         if (ret == NULL) {
10218                 if (flags & TRYAGAIN) {
10219                     if (RExC_parse == RExC_end) {
10220                          /* Make parent create an empty node if needed. */
10221                         *flagp |= TRYAGAIN;
10222                         return(NULL);
10223                     }
10224                     goto tryagain;
10225                 }
10226                 if (flags & RESTART_UTF8) {
10227                     *flagp = RESTART_UTF8;
10228                     return NULL;
10229                 }
10230                 FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10231         }
10232         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10233         break;
10234     case '|':
10235     case ')':
10236         if (flags & TRYAGAIN) {
10237             *flagp |= TRYAGAIN;
10238             return NULL;
10239         }
10240         vFAIL("Internal urp");
10241                                 /* Supposed to be caught earlier. */
10242         break;
10243     case '{':
10244         if (!regcurly(RExC_parse, FALSE)) {
10245             RExC_parse++;
10246             goto defchar;
10247         }
10248         /* FALL THROUGH */
10249     case '?':
10250     case '+':
10251     case '*':
10252         RExC_parse++;
10253         vFAIL("Quantifier follows nothing");
10254         break;
10255     case '\\':
10256         /* Special Escapes
10257
10258            This switch handles escape sequences that resolve to some kind
10259            of special regop and not to literal text. Escape sequnces that
10260            resolve to literal text are handled below in the switch marked
10261            "Literal Escapes".
10262
10263            Every entry in this switch *must* have a corresponding entry
10264            in the literal escape switch. However, the opposite is not
10265            required, as the default for this switch is to jump to the
10266            literal text handling code.
10267         */
10268         switch ((U8)*++RExC_parse) {
10269             U8 arg;
10270         /* Special Escapes */
10271         case 'A':
10272             RExC_seen_zerolen++;
10273             ret = reg_node(pRExC_state, SBOL);
10274             *flagp |= SIMPLE;
10275             goto finish_meta_pat;
10276         case 'G':
10277             ret = reg_node(pRExC_state, GPOS);
10278             RExC_seen |= REG_SEEN_GPOS;
10279             *flagp |= SIMPLE;
10280             goto finish_meta_pat;
10281         case 'K':
10282             RExC_seen_zerolen++;
10283             ret = reg_node(pRExC_state, KEEPS);
10284             *flagp |= SIMPLE;
10285             /* XXX:dmq : disabling in-place substitution seems to
10286              * be necessary here to avoid cases of memory corruption, as
10287              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10288              */
10289             RExC_seen |= REG_SEEN_LOOKBEHIND;
10290             goto finish_meta_pat;
10291         case 'Z':
10292             ret = reg_node(pRExC_state, SEOL);
10293             *flagp |= SIMPLE;
10294             RExC_seen_zerolen++;                /* Do not optimize RE away */
10295             goto finish_meta_pat;
10296         case 'z':
10297             ret = reg_node(pRExC_state, EOS);
10298             *flagp |= SIMPLE;
10299             RExC_seen_zerolen++;                /* Do not optimize RE away */
10300             goto finish_meta_pat;
10301         case 'C':
10302             ret = reg_node(pRExC_state, CANY);
10303             RExC_seen |= REG_SEEN_CANY;
10304             *flagp |= HASWIDTH|SIMPLE;
10305             goto finish_meta_pat;
10306         case 'X':
10307             ret = reg_node(pRExC_state, CLUMP);
10308             *flagp |= HASWIDTH;
10309             goto finish_meta_pat;
10310
10311         case 'W':
10312             invert = 1;
10313             /* FALLTHROUGH */
10314         case 'w':
10315             arg = ANYOF_WORDCHAR;
10316             goto join_posix;
10317
10318         case 'b':
10319             RExC_seen_zerolen++;
10320             RExC_seen |= REG_SEEN_LOOKBEHIND;
10321             op = BOUND + get_regex_charset(RExC_flags);
10322             if (op > BOUNDA) {  /* /aa is same as /a */
10323                 op = BOUNDA;
10324             }
10325             ret = reg_node(pRExC_state, op);
10326             FLAGS(ret) = get_regex_charset(RExC_flags);
10327             *flagp |= SIMPLE;
10328             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10329                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10330             }
10331             goto finish_meta_pat;
10332         case 'B':
10333             RExC_seen_zerolen++;
10334             RExC_seen |= REG_SEEN_LOOKBEHIND;
10335             op = NBOUND + get_regex_charset(RExC_flags);
10336             if (op > NBOUNDA) { /* /aa is same as /a */
10337                 op = NBOUNDA;
10338             }
10339             ret = reg_node(pRExC_state, op);
10340             FLAGS(ret) = get_regex_charset(RExC_flags);
10341             *flagp |= SIMPLE;
10342             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10343                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10344             }
10345             goto finish_meta_pat;
10346
10347         case 'D':
10348             invert = 1;
10349             /* FALLTHROUGH */
10350         case 'd':
10351             arg = ANYOF_DIGIT;
10352             goto join_posix;
10353
10354         case 'R':
10355             ret = reg_node(pRExC_state, LNBREAK);
10356             *flagp |= HASWIDTH|SIMPLE;
10357             goto finish_meta_pat;
10358
10359         case 'H':
10360             invert = 1;
10361             /* FALLTHROUGH */
10362         case 'h':
10363             arg = ANYOF_BLANK;
10364             op = POSIXU;
10365             goto join_posix_op_known;
10366
10367         case 'V':
10368             invert = 1;
10369             /* FALLTHROUGH */
10370         case 'v':
10371             arg = ANYOF_VERTWS;
10372             op = POSIXU;
10373             goto join_posix_op_known;
10374
10375         case 'S':
10376             invert = 1;
10377             /* FALLTHROUGH */
10378         case 's':
10379             arg = ANYOF_SPACE;
10380
10381         join_posix:
10382
10383             op = POSIXD + get_regex_charset(RExC_flags);
10384             if (op > POSIXA) {  /* /aa is same as /a */
10385                 op = POSIXA;
10386             }
10387
10388         join_posix_op_known:
10389
10390             if (invert) {
10391                 op += NPOSIXD - POSIXD;
10392             }
10393
10394             ret = reg_node(pRExC_state, op);
10395             if (! SIZE_ONLY) {
10396                 FLAGS(ret) = namedclass_to_classnum(arg);
10397             }
10398
10399             *flagp |= HASWIDTH|SIMPLE;
10400             /* FALL THROUGH */
10401
10402          finish_meta_pat:
10403             nextchar(pRExC_state);
10404             Set_Node_Length(ret, 2); /* MJD */
10405             break;
10406         case 'p':
10407         case 'P':
10408             {
10409 #ifdef DEBUGGING
10410                 char* parse_start = RExC_parse - 2;
10411 #endif
10412
10413                 RExC_parse--;
10414
10415                 ret = regclass(pRExC_state, flagp,depth+1,
10416                                TRUE, /* means just parse this element */
10417                                FALSE, /* don't allow multi-char folds */
10418                                FALSE, /* don't silence non-portable warnings.
10419                                          It would be a bug if these returned
10420                                          non-portables */
10421                                NULL);
10422                 /* regclass() can only return RESTART_UTF8 if multi-char folds
10423                    are allowed.  */
10424                 if (!ret)
10425                     FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10426                           *flagp);
10427
10428                 RExC_parse--;
10429
10430                 Set_Node_Offset(ret, parse_start + 2);
10431                 Set_Node_Cur_Length(ret);
10432                 nextchar(pRExC_state);
10433             }
10434             break;
10435         case 'N':
10436             /* Handle \N and \N{NAME} with multiple code points here and not
10437              * below because it can be multicharacter. join_exact() will join
10438              * them up later on.  Also this makes sure that things like
10439              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10440              * The options to the grok function call causes it to fail if the
10441              * sequence is just a single code point.  We then go treat it as
10442              * just another character in the current EXACT node, and hence it
10443              * gets uniform treatment with all the other characters.  The
10444              * special treatment for quantifiers is not needed for such single
10445              * character sequences */
10446             ++RExC_parse;
10447             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10448                                 FALSE /* not strict */ )) {
10449                 if (*flagp & RESTART_UTF8)
10450                     return NULL;
10451                 RExC_parse--;
10452                 goto defchar;
10453             }
10454             break;
10455         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10456         parse_named_seq:
10457         {
10458             char ch= RExC_parse[1];
10459             if (ch != '<' && ch != '\'' && ch != '{') {
10460                 RExC_parse++;
10461                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10462             } else {
10463                 /* this pretty much dupes the code for (?P=...) in reg(), if
10464                    you change this make sure you change that */
10465                 char* name_start = (RExC_parse += 2);
10466                 U32 num = 0;
10467                 SV *sv_dat = reg_scan_name(pRExC_state,
10468                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10469                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10470                 if (RExC_parse == name_start || *RExC_parse != ch)
10471                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10472
10473                 if (!SIZE_ONLY) {
10474                     num = add_data( pRExC_state, 1, "S" );
10475                     RExC_rxi->data->data[num]=(void*)sv_dat;
10476                     SvREFCNT_inc_simple_void(sv_dat);
10477                 }
10478
10479                 RExC_sawback = 1;
10480                 ret = reganode(pRExC_state,
10481                                ((! FOLD)
10482                                  ? NREF
10483                                  : (ASCII_FOLD_RESTRICTED)
10484                                    ? NREFFA
10485                                    : (AT_LEAST_UNI_SEMANTICS)
10486                                      ? NREFFU
10487                                      : (LOC)
10488                                        ? NREFFL
10489                                        : NREFF),
10490                                 num);
10491                 *flagp |= HASWIDTH;
10492
10493                 /* override incorrect value set in reganode MJD */
10494                 Set_Node_Offset(ret, parse_start+1);
10495                 Set_Node_Cur_Length(ret); /* MJD */
10496                 nextchar(pRExC_state);
10497
10498             }
10499             break;
10500         }
10501         case 'g':
10502         case '1': case '2': case '3': case '4':
10503         case '5': case '6': case '7': case '8': case '9':
10504             {
10505                 I32 num;
10506                 bool isg = *RExC_parse == 'g';
10507                 bool isrel = 0;
10508                 bool hasbrace = 0;
10509                 if (isg) {
10510                     RExC_parse++;
10511                     if (*RExC_parse == '{') {
10512                         RExC_parse++;
10513                         hasbrace = 1;
10514                     }
10515                     if (*RExC_parse == '-') {
10516                         RExC_parse++;
10517                         isrel = 1;
10518                     }
10519                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10520                         if (isrel) RExC_parse--;
10521                         RExC_parse -= 2;
10522                         goto parse_named_seq;
10523                 }   }
10524                 num = atoi(RExC_parse);
10525                 if (isg && num == 0)
10526                     vFAIL("Reference to invalid group 0");
10527                 if (isrel) {
10528                     num = RExC_npar - num;
10529                     if (num < 1)
10530                         vFAIL("Reference to nonexistent or unclosed group");
10531                 }
10532                 if (!isg && num > 9 && num >= RExC_npar)
10533                     /* Probably a character specified in octal, e.g. \35 */
10534                     goto defchar;
10535                 else {
10536                     char * const parse_start = RExC_parse - 1; /* MJD */
10537                     while (isDIGIT(*RExC_parse))
10538                         RExC_parse++;
10539                     if (parse_start == RExC_parse - 1)
10540                         vFAIL("Unterminated \\g... pattern");
10541                     if (hasbrace) {
10542                         if (*RExC_parse != '}')
10543                             vFAIL("Unterminated \\g{...} pattern");
10544                         RExC_parse++;
10545                     }
10546                     if (!SIZE_ONLY) {
10547                         if (num > (I32)RExC_rx->nparens)
10548                             vFAIL("Reference to nonexistent group");
10549                     }
10550                     RExC_sawback = 1;
10551                     ret = reganode(pRExC_state,
10552                                    ((! FOLD)
10553                                      ? REF
10554                                      : (ASCII_FOLD_RESTRICTED)
10555                                        ? REFFA
10556                                        : (AT_LEAST_UNI_SEMANTICS)
10557                                          ? REFFU
10558                                          : (LOC)
10559                                            ? REFFL
10560                                            : REFF),
10561                                     num);
10562                     *flagp |= HASWIDTH;
10563
10564                     /* override incorrect value set in reganode MJD */
10565                     Set_Node_Offset(ret, parse_start+1);
10566                     Set_Node_Cur_Length(ret); /* MJD */
10567                     RExC_parse--;
10568                     nextchar(pRExC_state);
10569                 }
10570             }
10571             break;
10572         case '\0':
10573             if (RExC_parse >= RExC_end)
10574                 FAIL("Trailing \\");
10575             /* FALL THROUGH */
10576         default:
10577             /* Do not generate "unrecognized" warnings here, we fall
10578                back into the quick-grab loop below */
10579             parse_start--;
10580             goto defchar;
10581         }
10582         break;
10583
10584     case '#':
10585         if (RExC_flags & RXf_PMf_EXTENDED) {
10586             if ( reg_skipcomment( pRExC_state ) )
10587                 goto tryagain;
10588         }
10589         /* FALL THROUGH */
10590
10591     default:
10592
10593             parse_start = RExC_parse - 1;
10594
10595             RExC_parse++;
10596
10597         defchar: {
10598             STRLEN len = 0;
10599             UV ender;
10600             char *p;
10601             char *s;
10602 #define MAX_NODE_STRING_SIZE 127
10603             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10604             char *s0;
10605             U8 upper_parse = MAX_NODE_STRING_SIZE;
10606             STRLEN foldlen;
10607             U8 node_type;
10608             bool next_is_quantifier;
10609             char * oldp = NULL;
10610
10611             /* If a folding node contains only code points that don't
10612              * participate in folds, it can be changed into an EXACT node,
10613              * which allows the optimizer more things to look for */
10614             bool maybe_exact;
10615
10616             ender = 0;
10617             node_type = compute_EXACTish(pRExC_state);
10618             ret = reg_node(pRExC_state, node_type);
10619
10620             /* In pass1, folded, we use a temporary buffer instead of the
10621              * actual node, as the node doesn't exist yet */
10622             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10623
10624             s0 = s;
10625
10626         reparse:
10627
10628             /* We do the EXACTFish to EXACT node only if folding, and not if in
10629              * locale, as whether a character folds or not isn't known until
10630              * runtime */
10631             maybe_exact = FOLD && ! LOC;
10632
10633             /* XXX The node can hold up to 255 bytes, yet this only goes to
10634              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10635              * 255 allows us to not have to worry about overflow due to
10636              * converting to utf8 and fold expansion, but that value is
10637              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10638              * split up by this limit into a single one using the real max of
10639              * 255.  Even at 127, this breaks under rare circumstances.  If
10640              * folding, we do not want to split a node at a character that is a
10641              * non-final in a multi-char fold, as an input string could just
10642              * happen to want to match across the node boundary.  The join
10643              * would solve that problem if the join actually happens.  But a
10644              * series of more than two nodes in a row each of 127 would cause
10645              * the first join to succeed to get to 254, but then there wouldn't
10646              * be room for the next one, which could at be one of those split
10647              * multi-char folds.  I don't know of any fool-proof solution.  One
10648              * could back off to end with only a code point that isn't such a
10649              * non-final, but it is possible for there not to be any in the
10650              * entire node. */
10651             for (p = RExC_parse - 1;
10652                  len < upper_parse && p < RExC_end;
10653                  len++)
10654             {
10655                 oldp = p;
10656
10657                 if (RExC_flags & RXf_PMf_EXTENDED)
10658                     p = regwhite( pRExC_state, p );
10659                 switch ((U8)*p) {
10660                 case '^':
10661                 case '$':
10662                 case '.':
10663                 case '[':
10664                 case '(':
10665                 case ')':
10666                 case '|':
10667                     goto loopdone;
10668                 case '\\':
10669                     /* Literal Escapes Switch
10670
10671                        This switch is meant to handle escape sequences that
10672                        resolve to a literal character.
10673
10674                        Every escape sequence that represents something
10675                        else, like an assertion or a char class, is handled
10676                        in the switch marked 'Special Escapes' above in this
10677                        routine, but also has an entry here as anything that
10678                        isn't explicitly mentioned here will be treated as
10679                        an unescaped equivalent literal.
10680                     */
10681
10682                     switch ((U8)*++p) {
10683                     /* These are all the special escapes. */
10684                     case 'A':             /* Start assertion */
10685                     case 'b': case 'B':   /* Word-boundary assertion*/
10686                     case 'C':             /* Single char !DANGEROUS! */
10687                     case 'd': case 'D':   /* digit class */
10688                     case 'g': case 'G':   /* generic-backref, pos assertion */
10689                     case 'h': case 'H':   /* HORIZWS */
10690                     case 'k': case 'K':   /* named backref, keep marker */
10691                     case 'p': case 'P':   /* Unicode property */
10692                               case 'R':   /* LNBREAK */
10693                     case 's': case 'S':   /* space class */
10694                     case 'v': case 'V':   /* VERTWS */
10695                     case 'w': case 'W':   /* word class */
10696                     case 'X':             /* eXtended Unicode "combining character sequence" */
10697                     case 'z': case 'Z':   /* End of line/string assertion */
10698                         --p;
10699                         goto loopdone;
10700
10701                     /* Anything after here is an escape that resolves to a
10702                        literal. (Except digits, which may or may not)
10703                      */
10704                     case 'n':
10705                         ender = '\n';
10706                         p++;
10707                         break;
10708                     case 'N': /* Handle a single-code point named character. */
10709                         /* The options cause it to fail if a multiple code
10710                          * point sequence.  Handle those in the switch() above
10711                          * */
10712                         RExC_parse = p + 1;
10713                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10714                                             flagp, depth, FALSE,
10715                                             FALSE /* not strict */ ))
10716                         {
10717                             if (*flagp & RESTART_UTF8)
10718                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10719                             RExC_parse = p = oldp;
10720                             goto loopdone;
10721                         }
10722                         p = RExC_parse;
10723                         if (ender > 0xff) {
10724                             REQUIRE_UTF8;
10725                         }
10726                         break;
10727                     case 'r':
10728                         ender = '\r';
10729                         p++;
10730                         break;
10731                     case 't':
10732                         ender = '\t';
10733                         p++;
10734                         break;
10735                     case 'f':
10736                         ender = '\f';
10737                         p++;
10738                         break;
10739                     case 'e':
10740                           ender = ASCII_TO_NATIVE('\033');
10741                         p++;
10742                         break;
10743                     case 'a':
10744                           ender = ASCII_TO_NATIVE('\007');
10745                         p++;
10746                         break;
10747                     case 'o':
10748                         {
10749                             UV result;
10750                             const char* error_msg;
10751
10752                             bool valid = grok_bslash_o(&p,
10753                                                        &result,
10754                                                        &error_msg,
10755                                                        TRUE, /* out warnings */
10756                                                        FALSE, /* not strict */
10757                                                        TRUE, /* Output warnings
10758                                                                 for non-
10759                                                                 portables */
10760                                                        UTF);
10761                             if (! valid) {
10762                                 RExC_parse = p; /* going to die anyway; point
10763                                                    to exact spot of failure */
10764                                 vFAIL(error_msg);
10765                             }
10766                             ender = result;
10767                             if (PL_encoding && ender < 0x100) {
10768                                 goto recode_encoding;
10769                             }
10770                             if (ender > 0xff) {
10771                                 REQUIRE_UTF8;
10772                             }
10773                             break;
10774                         }
10775                     case 'x':
10776                         {
10777                             UV result = UV_MAX; /* initialize to erroneous
10778                                                    value */
10779                             const char* error_msg;
10780
10781                             bool valid = grok_bslash_x(&p,
10782                                                        &result,
10783                                                        &error_msg,
10784                                                        TRUE, /* out warnings */
10785                                                        FALSE, /* not strict */
10786                                                        TRUE, /* Output warnings
10787                                                                 for non-
10788                                                                 portables */
10789                                                        UTF);
10790                             if (! valid) {
10791                                 RExC_parse = p; /* going to die anyway; point
10792                                                    to exact spot of failure */
10793                                 vFAIL(error_msg);
10794                             }
10795                             ender = result;
10796
10797                             if (PL_encoding && ender < 0x100) {
10798                                 goto recode_encoding;
10799                             }
10800                             if (ender > 0xff) {
10801                                 REQUIRE_UTF8;
10802                             }
10803                             break;
10804                         }
10805                     case 'c':
10806                         p++;
10807                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10808                         break;
10809                     case '0': case '1': case '2': case '3':case '4':
10810                     case '5': case '6': case '7':
10811                         if (*p == '0' ||
10812                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10813                         {
10814                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10815                             STRLEN numlen = 3;
10816                             ender = grok_oct(p, &numlen, &flags, NULL);
10817                             if (ender > 0xff) {
10818                                 REQUIRE_UTF8;
10819                             }
10820                             p += numlen;
10821                             if (SIZE_ONLY   /* like \08, \178 */
10822                                 && numlen < 3
10823                                 && p < RExC_end
10824                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10825                             {
10826                                 reg_warn_non_literal_string(
10827                                          p + 1,
10828                                          form_short_octal_warning(p, numlen));
10829                             }
10830                         }
10831                         else {  /* Not to be treated as an octal constant, go
10832                                    find backref */
10833                             --p;
10834                             goto loopdone;
10835                         }
10836                         if (PL_encoding && ender < 0x100)
10837                             goto recode_encoding;
10838                         break;
10839                     recode_encoding:
10840                         if (! RExC_override_recoding) {
10841                             SV* enc = PL_encoding;
10842                             ender = reg_recode((const char)(U8)ender, &enc);
10843                             if (!enc && SIZE_ONLY)
10844                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10845                             REQUIRE_UTF8;
10846                         }
10847                         break;
10848                     case '\0':
10849                         if (p >= RExC_end)
10850                             FAIL("Trailing \\");
10851                         /* FALL THROUGH */
10852                     default:
10853                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10854                             /* Include any { following the alpha to emphasize
10855                              * that it could be part of an escape at some point
10856                              * in the future */
10857                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10858                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10859                         }
10860                         goto normal_default;
10861                     } /* End of switch on '\' */
10862                     break;
10863                 default:    /* A literal character */
10864
10865                     if (! SIZE_ONLY
10866                         && RExC_flags & RXf_PMf_EXTENDED
10867                         && ckWARN(WARN_DEPRECATED)
10868                         && is_PATWS_non_low(p, UTF))
10869                     {
10870                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10871                                 "Escape literal pattern white space under /x");
10872                     }
10873
10874                   normal_default:
10875                     if (UTF8_IS_START(*p) && UTF) {
10876                         STRLEN numlen;
10877                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10878                                                &numlen, UTF8_ALLOW_DEFAULT);
10879                         p += numlen;
10880                     }
10881                     else
10882                         ender = (U8) *p++;
10883                     break;
10884                 } /* End of switch on the literal */
10885
10886                 /* Here, have looked at the literal character and <ender>
10887                  * contains its ordinal, <p> points to the character after it
10888                  */
10889
10890                 if ( RExC_flags & RXf_PMf_EXTENDED)
10891                     p = regwhite( pRExC_state, p );
10892
10893                 /* If the next thing is a quantifier, it applies to this
10894                  * character only, which means that this character has to be in
10895                  * its own node and can't just be appended to the string in an
10896                  * existing node, so if there are already other characters in
10897                  * the node, close the node with just them, and set up to do
10898                  * this character again next time through, when it will be the
10899                  * only thing in its new node */
10900                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10901                 {
10902                     p = oldp;
10903                     goto loopdone;
10904                 }
10905
10906                 if (FOLD) {
10907                     if (UTF
10908                             /* See comments for join_exact() as to why we fold
10909                              * this non-UTF at compile time */
10910                         || (node_type == EXACTFU
10911                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10912                     {
10913
10914
10915                         /* Prime the casefolded buffer.  Locale rules, which
10916                          * apply only to code points < 256, aren't known until
10917                          * execution, so for them, just output the original
10918                          * character using utf8.  If we start to fold non-UTF
10919                          * patterns, be sure to update join_exact() */
10920                         if (LOC && ender < 256) {
10921                             if (UNI_IS_INVARIANT(ender)) {
10922                                 *s = (U8) ender;
10923                                 foldlen = 1;
10924                             } else {
10925                                 *s = UTF8_TWO_BYTE_HI(ender);
10926                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10927                                 foldlen = 2;
10928                             }
10929                         }
10930                         else {
10931                             UV folded = _to_uni_fold_flags(
10932                                            ender,
10933                                            (U8 *) s,
10934                                            &foldlen,
10935                                            FOLD_FLAGS_FULL
10936                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10937                                                     : (ASCII_FOLD_RESTRICTED)
10938                                                       ? FOLD_FLAGS_NOMIX_ASCII
10939                                                       : 0)
10940                                             );
10941
10942                             /* If this node only contains non-folding code
10943                              * points so far, see if this new one is also
10944                              * non-folding */
10945                             if (maybe_exact) {
10946                                 if (folded != ender) {
10947                                     maybe_exact = FALSE;
10948                                 }
10949                                 else {
10950                                     /* Here the fold is the original; we have
10951                                      * to check further to see if anything
10952                                      * folds to it */
10953                                     if (! PL_utf8_foldable) {
10954                                         SV* swash = swash_init("utf8",
10955                                                            "_Perl_Any_Folds",
10956                                                            &PL_sv_undef, 1, 0);
10957                                         PL_utf8_foldable =
10958                                                     _get_swash_invlist(swash);
10959                                         SvREFCNT_dec_NN(swash);
10960                                     }
10961                                     if (_invlist_contains_cp(PL_utf8_foldable,
10962                                                              ender))
10963                                     {
10964                                         maybe_exact = FALSE;
10965                                     }
10966                                 }
10967                             }
10968                             ender = folded;
10969                         }
10970                         s += foldlen;
10971
10972                         /* The loop increments <len> each time, as all but this
10973                          * path (and the one just below for UTF) through it add
10974                          * a single byte to the EXACTish node.  But this one
10975                          * has changed len to be the correct final value, so
10976                          * subtract one to cancel out the increment that
10977                          * follows */
10978                         len += foldlen - 1;
10979                     }
10980                     else {
10981                         *(s++) = (char) ender;
10982                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10983                     }
10984                 }
10985                 else if (UTF) {
10986                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10987                     if (unilen > 0) {
10988                        s   += unilen;
10989                        len += unilen;
10990                     }
10991
10992                     /* See comment just above for - 1 */
10993                     len--;
10994                 }
10995                 else {
10996                     REGC((char)ender, s++);
10997                 }
10998
10999                 if (next_is_quantifier) {
11000
11001                     /* Here, the next input is a quantifier, and to get here,
11002                      * the current character is the only one in the node.
11003                      * Also, here <len> doesn't include the final byte for this
11004                      * character */
11005                     len++;
11006                     goto loopdone;
11007                 }
11008
11009             } /* End of loop through literal characters */
11010
11011             /* Here we have either exhausted the input or ran out of room in
11012              * the node.  (If we encountered a character that can't be in the
11013              * node, transfer is made directly to <loopdone>, and so we
11014              * wouldn't have fallen off the end of the loop.)  In the latter
11015              * case, we artificially have to split the node into two, because
11016              * we just don't have enough space to hold everything.  This
11017              * creates a problem if the final character participates in a
11018              * multi-character fold in the non-final position, as a match that
11019              * should have occurred won't, due to the way nodes are matched,
11020              * and our artificial boundary.  So back off until we find a non-
11021              * problematic character -- one that isn't at the beginning or
11022              * middle of such a fold.  (Either it doesn't participate in any
11023              * folds, or appears only in the final position of all the folds it
11024              * does participate in.)  A better solution with far fewer false
11025              * positives, and that would fill the nodes more completely, would
11026              * be to actually have available all the multi-character folds to
11027              * test against, and to back-off only far enough to be sure that
11028              * this node isn't ending with a partial one.  <upper_parse> is set
11029              * further below (if we need to reparse the node) to include just
11030              * up through that final non-problematic character that this code
11031              * identifies, so when it is set to less than the full node, we can
11032              * skip the rest of this */
11033             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11034
11035                 const STRLEN full_len = len;
11036
11037                 assert(len >= MAX_NODE_STRING_SIZE);
11038
11039                 /* Here, <s> points to the final byte of the final character.
11040                  * Look backwards through the string until find a non-
11041                  * problematic character */
11042
11043                 if (! UTF) {
11044
11045                     /* These two have no multi-char folds to non-UTF characters
11046                      */
11047                     if (ASCII_FOLD_RESTRICTED || LOC) {
11048                         goto loopdone;
11049                     }
11050
11051                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11052                     len = s - s0 + 1;
11053                 }
11054                 else {
11055                     if (!  PL_NonL1NonFinalFold) {
11056                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11057                                         NonL1_Perl_Non_Final_Folds_invlist);
11058                     }
11059
11060                     /* Point to the first byte of the final character */
11061                     s = (char *) utf8_hop((U8 *) s, -1);
11062
11063                     while (s >= s0) {   /* Search backwards until find
11064                                            non-problematic char */
11065                         if (UTF8_IS_INVARIANT(*s)) {
11066
11067                             /* There are no ascii characters that participate
11068                              * in multi-char folds under /aa.  In EBCDIC, the
11069                              * non-ascii invariants are all control characters,
11070                              * so don't ever participate in any folds. */
11071                             if (ASCII_FOLD_RESTRICTED
11072                                 || ! IS_NON_FINAL_FOLD(*s))
11073                             {
11074                                 break;
11075                             }
11076                         }
11077                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11078
11079                             /* No Latin1 characters participate in multi-char
11080                              * folds under /l */
11081                             if (LOC
11082                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11083                                                                 *s, *(s+1))))
11084                             {
11085                                 break;
11086                             }
11087                         }
11088                         else if (! _invlist_contains_cp(
11089                                         PL_NonL1NonFinalFold,
11090                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11091                         {
11092                             break;
11093                         }
11094
11095                         /* Here, the current character is problematic in that
11096                          * it does occur in the non-final position of some
11097                          * fold, so try the character before it, but have to
11098                          * special case the very first byte in the string, so
11099                          * we don't read outside the string */
11100                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11101                     } /* End of loop backwards through the string */
11102
11103                     /* If there were only problematic characters in the string,
11104                      * <s> will point to before s0, in which case the length
11105                      * should be 0, otherwise include the length of the
11106                      * non-problematic character just found */
11107                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11108                 }
11109
11110                 /* Here, have found the final character, if any, that is
11111                  * non-problematic as far as ending the node without splitting
11112                  * it across a potential multi-char fold.  <len> contains the
11113                  * number of bytes in the node up-to and including that
11114                  * character, or is 0 if there is no such character, meaning
11115                  * the whole node contains only problematic characters.  In
11116                  * this case, give up and just take the node as-is.  We can't
11117                  * do any better */
11118                 if (len == 0) {
11119                     len = full_len;
11120                 } else {
11121
11122                     /* Here, the node does contain some characters that aren't
11123                      * problematic.  If one such is the final character in the
11124                      * node, we are done */
11125                     if (len == full_len) {
11126                         goto loopdone;
11127                     }
11128                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11129
11130                         /* If the final character is problematic, but the
11131                          * penultimate is not, back-off that last character to
11132                          * later start a new node with it */
11133                         p = oldp;
11134                         goto loopdone;
11135                     }
11136
11137                     /* Here, the final non-problematic character is earlier
11138                      * in the input than the penultimate character.  What we do
11139                      * is reparse from the beginning, going up only as far as
11140                      * this final ok one, thus guaranteeing that the node ends
11141                      * in an acceptable character.  The reason we reparse is
11142                      * that we know how far in the character is, but we don't
11143                      * know how to correlate its position with the input parse.
11144                      * An alternate implementation would be to build that
11145                      * correlation as we go along during the original parse,
11146                      * but that would entail extra work for every node, whereas
11147                      * this code gets executed only when the string is too
11148                      * large for the node, and the final two characters are
11149                      * problematic, an infrequent occurrence.  Yet another
11150                      * possible strategy would be to save the tail of the
11151                      * string, and the next time regatom is called, initialize
11152                      * with that.  The problem with this is that unless you
11153                      * back off one more character, you won't be guaranteed
11154                      * regatom will get called again, unless regbranch,
11155                      * regpiece ... are also changed.  If you do back off that
11156                      * extra character, so that there is input guaranteed to
11157                      * force calling regatom, you can't handle the case where
11158                      * just the first character in the node is acceptable.  I
11159                      * (khw) decided to try this method which doesn't have that
11160                      * pitfall; if performance issues are found, we can do a
11161                      * combination of the current approach plus that one */
11162                     upper_parse = len;
11163                     len = 0;
11164                     s = s0;
11165                     goto reparse;
11166                 }
11167             }   /* End of verifying node ends with an appropriate char */
11168
11169         loopdone:   /* Jumped to when encounters something that shouldn't be in
11170                        the node */
11171
11172             /* If 'maybe_exact' is still set here, means there are no
11173              * code points in the node that participate in folds */
11174             if (FOLD && maybe_exact) {
11175                 OP(ret) = EXACT;
11176             }
11177
11178             /* I (khw) don't know if you can get here with zero length, but the
11179              * old code handled this situation by creating a zero-length EXACT
11180              * node.  Might as well be NOTHING instead */
11181             if (len == 0) {
11182                 OP(ret) = NOTHING;
11183             }
11184             else{
11185                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11186             }
11187
11188             RExC_parse = p - 1;
11189             Set_Node_Cur_Length(ret); /* MJD */
11190             nextchar(pRExC_state);
11191             {
11192                 /* len is STRLEN which is unsigned, need to copy to signed */
11193                 IV iv = len;
11194                 if (iv < 0)
11195                     vFAIL("Internal disaster");
11196             }
11197
11198         } /* End of label 'defchar:' */
11199         break;
11200     } /* End of giant switch on input character */
11201
11202     return(ret);
11203 }
11204
11205 STATIC char *
11206 S_regwhite( RExC_state_t *pRExC_state, char *p )
11207 {
11208     const char *e = RExC_end;
11209
11210     PERL_ARGS_ASSERT_REGWHITE;
11211
11212     while (p < e) {
11213         if (isSPACE(*p))
11214             ++p;
11215         else if (*p == '#') {
11216             bool ended = 0;
11217             do {
11218                 if (*p++ == '\n') {
11219                     ended = 1;
11220                     break;
11221                 }
11222             } while (p < e);
11223             if (!ended)
11224                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11225         }
11226         else
11227             break;
11228     }
11229     return p;
11230 }
11231
11232 STATIC char *
11233 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11234 {
11235     /* Returns the next non-pattern-white space, non-comment character (the
11236      * latter only if 'recognize_comment is true) in the string p, which is
11237      * ended by RExC_end.  If there is no line break ending a comment,
11238      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11239     const char *e = RExC_end;
11240
11241     PERL_ARGS_ASSERT_REGPATWS;
11242
11243     while (p < e) {
11244         STRLEN len;
11245         if ((len = is_PATWS_safe(p, e, UTF))) {
11246             p += len;
11247         }
11248         else if (recognize_comment && *p == '#') {
11249             bool ended = 0;
11250             do {
11251                 p++;
11252                 if (is_LNBREAK_safe(p, e, UTF)) {
11253                     ended = 1;
11254                     break;
11255                 }
11256             } while (p < e);
11257             if (!ended)
11258                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11259         }
11260         else
11261             break;
11262     }
11263     return p;
11264 }
11265
11266 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11267    Character classes ([:foo:]) can also be negated ([:^foo:]).
11268    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11269    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11270    but trigger failures because they are currently unimplemented. */
11271
11272 #define POSIXCC_DONE(c)   ((c) == ':')
11273 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11274 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11275
11276 PERL_STATIC_INLINE I32
11277 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11278 {
11279     dVAR;
11280     I32 namedclass = OOB_NAMEDCLASS;
11281
11282     PERL_ARGS_ASSERT_REGPPOSIXCC;
11283
11284     if (value == '[' && RExC_parse + 1 < RExC_end &&
11285         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11286         POSIXCC(UCHARAT(RExC_parse)))
11287     {
11288         const char c = UCHARAT(RExC_parse);
11289         char* const s = RExC_parse++;
11290
11291         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11292             RExC_parse++;
11293         if (RExC_parse == RExC_end) {
11294             if (strict) {
11295
11296                 /* Try to give a better location for the error (than the end of
11297                  * the string) by looking for the matching ']' */
11298                 RExC_parse = s;
11299                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11300                     RExC_parse++;
11301                 }
11302                 vFAIL2("Unmatched '%c' in POSIX class", c);
11303             }
11304             /* Grandfather lone [:, [=, [. */
11305             RExC_parse = s;
11306         }
11307         else {
11308             const char* const t = RExC_parse++; /* skip over the c */
11309             assert(*t == c);
11310
11311             if (UCHARAT(RExC_parse) == ']') {
11312                 const char *posixcc = s + 1;
11313                 RExC_parse++; /* skip over the ending ] */
11314
11315                 if (*s == ':') {
11316                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11317                     const I32 skip = t - posixcc;
11318
11319                     /* Initially switch on the length of the name.  */
11320                     switch (skip) {
11321                     case 4:
11322                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11323                                                           this is the Perl \w
11324                                                         */
11325                             namedclass = ANYOF_WORDCHAR;
11326                         break;
11327                     case 5:
11328                         /* Names all of length 5.  */
11329                         /* alnum alpha ascii blank cntrl digit graph lower
11330                            print punct space upper  */
11331                         /* Offset 4 gives the best switch position.  */
11332                         switch (posixcc[4]) {
11333                         case 'a':
11334                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11335                                 namedclass = ANYOF_ALPHA;
11336                             break;
11337                         case 'e':
11338                             if (memEQ(posixcc, "spac", 4)) /* space */
11339                                 namedclass = ANYOF_PSXSPC;
11340                             break;
11341                         case 'h':
11342                             if (memEQ(posixcc, "grap", 4)) /* graph */
11343                                 namedclass = ANYOF_GRAPH;
11344                             break;
11345                         case 'i':
11346                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11347                                 namedclass = ANYOF_ASCII;
11348                             break;
11349                         case 'k':
11350                             if (memEQ(posixcc, "blan", 4)) /* blank */
11351                                 namedclass = ANYOF_BLANK;
11352                             break;
11353                         case 'l':
11354                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11355                                 namedclass = ANYOF_CNTRL;
11356                             break;
11357                         case 'm':
11358                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11359                                 namedclass = ANYOF_ALPHANUMERIC;
11360                             break;
11361                         case 'r':
11362                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11363                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11364                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11365                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11366                             break;
11367                         case 't':
11368                             if (memEQ(posixcc, "digi", 4)) /* digit */
11369                                 namedclass = ANYOF_DIGIT;
11370                             else if (memEQ(posixcc, "prin", 4)) /* print */
11371                                 namedclass = ANYOF_PRINT;
11372                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11373                                 namedclass = ANYOF_PUNCT;
11374                             break;
11375                         }
11376                         break;
11377                     case 6:
11378                         if (memEQ(posixcc, "xdigit", 6))
11379                             namedclass = ANYOF_XDIGIT;
11380                         break;
11381                     }
11382
11383                     if (namedclass == OOB_NAMEDCLASS)
11384                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11385                                       t - s - 1, s + 1);
11386
11387                     /* The #defines are structured so each complement is +1 to
11388                      * the normal one */
11389                     if (complement) {
11390                         namedclass++;
11391                     }
11392                     assert (posixcc[skip] == ':');
11393                     assert (posixcc[skip+1] == ']');
11394                 } else if (!SIZE_ONLY) {
11395                     /* [[=foo=]] and [[.foo.]] are still future. */
11396
11397                     /* adjust RExC_parse so the warning shows after
11398                        the class closes */
11399                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11400                         RExC_parse++;
11401                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11402                 }
11403             } else {
11404                 /* Maternal grandfather:
11405                  * "[:" ending in ":" but not in ":]" */
11406                 if (strict) {
11407                     vFAIL("Unmatched '[' in POSIX class");
11408                 }
11409
11410                 /* Grandfather lone [:, [=, [. */
11411                 RExC_parse = s;
11412             }
11413         }
11414     }
11415
11416     return namedclass;
11417 }
11418
11419 STATIC bool
11420 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11421 {
11422     /* This applies some heuristics at the current parse position (which should
11423      * be at a '[') to see if what follows might be intended to be a [:posix:]
11424      * class.  It returns true if it really is a posix class, of course, but it
11425      * also can return true if it thinks that what was intended was a posix
11426      * class that didn't quite make it.
11427      *
11428      * It will return true for
11429      *      [:alphanumerics:
11430      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11431      *                         ')' indicating the end of the (?[
11432      *      [:any garbage including %^&$ punctuation:]
11433      *
11434      * This is designed to be called only from S_handle_regex_sets; it could be
11435      * easily adapted to be called from the spot at the beginning of regclass()
11436      * that checks to see in a normal bracketed class if the surrounding []
11437      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11438      * change long-standing behavior, so I (khw) didn't do that */
11439     char* p = RExC_parse + 1;
11440     char first_char = *p;
11441
11442     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11443
11444     assert(*(p - 1) == '[');
11445
11446     if (! POSIXCC(first_char)) {
11447         return FALSE;
11448     }
11449
11450     p++;
11451     while (p < RExC_end && isWORDCHAR(*p)) p++;
11452
11453     if (p >= RExC_end) {
11454         return FALSE;
11455     }
11456
11457     if (p - RExC_parse > 2    /* Got at least 1 word character */
11458         && (*p == first_char
11459             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11460     {
11461         return TRUE;
11462     }
11463
11464     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11465
11466     return (p
11467             && p - RExC_parse > 2 /* [:] evaluates to colon;
11468                                       [::] is a bad posix class. */
11469             && first_char == *(p - 1));
11470 }
11471
11472 STATIC regnode *
11473 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11474                    char * const oregcomp_parse)
11475 {
11476     /* Handle the (?[...]) construct to do set operations */
11477
11478     U8 curchar;
11479     UV start, end;      /* End points of code point ranges */
11480     SV* result_string;
11481     char *save_end, *save_parse;
11482     SV* final;
11483     STRLEN len;
11484     regnode* node;
11485     AV* stack;
11486     const bool save_fold = FOLD;
11487
11488     GET_RE_DEBUG_FLAGS_DECL;
11489
11490     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11491
11492     if (LOC) {
11493         vFAIL("(?[...]) not valid in locale");
11494     }
11495     RExC_uni_semantics = 1;
11496
11497     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11498      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11499      * call regclass to handle '[]' so as to not have to reinvent its parsing
11500      * rules here (throwing away the size it computes each time).  And, we exit
11501      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11502      * these things, we need to realize that something preceded by a backslash
11503      * is escaped, so we have to keep track of backslashes */
11504     if (SIZE_ONLY) {
11505
11506         Perl_ck_warner_d(aTHX_
11507             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11508             "The regex_sets feature is experimental" REPORT_LOCATION,
11509             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11510
11511         while (RExC_parse < RExC_end) {
11512             SV* current = NULL;
11513             RExC_parse = regpatws(pRExC_state, RExC_parse,
11514                                 TRUE); /* means recognize comments */
11515             switch (*RExC_parse) {
11516                 default:
11517                     break;
11518                 case '\\':
11519                     /* Skip the next byte (which could cause us to end up in
11520                      * the middle of a UTF-8 character, but since none of those
11521                      * are confusable with anything we currently handle in this
11522                      * switch (invariants all), it's safe.  We'll just hit the
11523                      * default: case next time and keep on incrementing until
11524                      * we find one of the invariants we do handle. */
11525                     RExC_parse++;
11526                     break;
11527                 case '[':
11528                 {
11529                     /* If this looks like it is a [:posix:] class, leave the
11530                      * parse pointer at the '[' to fool regclass() into
11531                      * thinking it is part of a '[[:posix:]]'.  That function
11532                      * will use strict checking to force a syntax error if it
11533                      * doesn't work out to a legitimate class */
11534                     bool is_posix_class
11535                                     = could_it_be_a_POSIX_class(pRExC_state);
11536                     if (! is_posix_class) {
11537                         RExC_parse++;
11538                     }
11539
11540                     /* regclass() can only return RESTART_UTF8 if multi-char
11541                        folds are allowed.  */
11542                     if (!regclass(pRExC_state, flagp,depth+1,
11543                                   is_posix_class, /* parse the whole char
11544                                                      class only if not a
11545                                                      posix class */
11546                                   FALSE, /* don't allow multi-char folds */
11547                                   TRUE, /* silence non-portable warnings. */
11548                                   &current))
11549                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11550                               *flagp);
11551
11552                     /* function call leaves parse pointing to the ']', except
11553                      * if we faked it */
11554                     if (is_posix_class) {
11555                         RExC_parse--;
11556                     }
11557
11558                     SvREFCNT_dec(current);   /* In case it returned something */
11559                     break;
11560                 }
11561
11562                 case ']':
11563                     RExC_parse++;
11564                     if (RExC_parse < RExC_end
11565                         && *RExC_parse == ')')
11566                     {
11567                         node = reganode(pRExC_state, ANYOF, 0);
11568                         RExC_size += ANYOF_SKIP;
11569                         nextchar(pRExC_state);
11570                         Set_Node_Length(node,
11571                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11572                         return node;
11573                     }
11574                     goto no_close;
11575             }
11576             RExC_parse++;
11577         }
11578
11579         no_close:
11580         FAIL("Syntax error in (?[...])");
11581     }
11582
11583     /* Pass 2 only after this.  Everything in this construct is a
11584      * metacharacter.  Operands begin with either a '\' (for an escape
11585      * sequence), or a '[' for a bracketed character class.  Any other
11586      * character should be an operator, or parenthesis for grouping.  Both
11587      * types of operands are handled by calling regclass() to parse them.  It
11588      * is called with a parameter to indicate to return the computed inversion
11589      * list.  The parsing here is implemented via a stack.  Each entry on the
11590      * stack is a single character representing one of the operators, or the
11591      * '('; or else a pointer to an operand inversion list. */
11592
11593 #define IS_OPERAND(a)  (! SvIOK(a))
11594
11595     /* The stack starts empty.  It is a syntax error if the first thing parsed
11596      * is a binary operator; everything else is pushed on the stack.  When an
11597      * operand is parsed, the top of the stack is examined.  If it is a binary
11598      * operator, the item before it should be an operand, and both are replaced
11599      * by the result of doing that operation on the new operand and the one on
11600      * the stack.   Thus a sequence of binary operands is reduced to a single
11601      * one before the next one is parsed.
11602      *
11603      * A unary operator may immediately follow a binary in the input, for
11604      * example
11605      *      [a] + ! [b]
11606      * When an operand is parsed and the top of the stack is a unary operator,
11607      * the operation is performed, and then the stack is rechecked to see if
11608      * this new operand is part of a binary operation; if so, it is handled as
11609      * above.
11610      *
11611      * A '(' is simply pushed on the stack; it is valid only if the stack is
11612      * empty, or the top element of the stack is an operator or another '('
11613      * (for which the parenthesized expression will become an operand).  By the
11614      * time the corresponding ')' is parsed everything in between should have
11615      * been parsed and evaluated to a single operand (or else is a syntax
11616      * error), and is handled as a regular operand */
11617
11618     stack = newAV();
11619
11620     while (RExC_parse < RExC_end) {
11621         I32 top_index = av_tindex(stack);
11622         SV** top_ptr;
11623         SV* current = NULL;
11624
11625         /* Skip white space */
11626         RExC_parse = regpatws(pRExC_state, RExC_parse,
11627                                 TRUE); /* means recognize comments */
11628         if (RExC_parse >= RExC_end) {
11629             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11630         }
11631         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11632             break;
11633         }
11634
11635         switch (curchar) {
11636
11637             case '?':
11638                 if (av_tindex(stack) >= 0   /* This makes sure that we can
11639                                                safely subtract 1 from
11640                                                RExC_parse in the next clause.
11641                                                If we have something on the
11642                                                stack, we have parsed something
11643                                              */
11644                     && UCHARAT(RExC_parse - 1) == '('
11645                     && RExC_parse < RExC_end)
11646                 {
11647                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11648                      * This happens when we have some thing like
11649                      *
11650                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11651                      *   ...
11652                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11653                      *
11654                      * Here we would be handling the interpolated
11655                      * '$thai_or_lao'.  We handle this by a recursive call to
11656                      * ourselves which returns the inversion list the
11657                      * interpolated expression evaluates to.  We use the flags
11658                      * from the interpolated pattern. */
11659                     U32 save_flags = RExC_flags;
11660                     const char * const save_parse = ++RExC_parse;
11661
11662                     parse_lparen_question_flags(pRExC_state);
11663
11664                     if (RExC_parse == save_parse  /* Makes sure there was at
11665                                                      least one flag (or this
11666                                                      embedding wasn't compiled)
11667                                                    */
11668                         || RExC_parse >= RExC_end - 4
11669                         || UCHARAT(RExC_parse) != ':'
11670                         || UCHARAT(++RExC_parse) != '('
11671                         || UCHARAT(++RExC_parse) != '?'
11672                         || UCHARAT(++RExC_parse) != '[')
11673                     {
11674
11675                         /* In combination with the above, this moves the
11676                          * pointer to the point just after the first erroneous
11677                          * character (or if there are no flags, to where they
11678                          * should have been) */
11679                         if (RExC_parse >= RExC_end - 4) {
11680                             RExC_parse = RExC_end;
11681                         }
11682                         else if (RExC_parse != save_parse) {
11683                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11684                         }
11685                         vFAIL("Expecting '(?flags:(?[...'");
11686                     }
11687                     RExC_parse++;
11688                     (void) handle_regex_sets(pRExC_state, &current, flagp,
11689                                                     depth+1, oregcomp_parse);
11690
11691                     /* Here, 'current' contains the embedded expression's
11692                      * inversion list, and RExC_parse points to the trailing
11693                      * ']'; the next character should be the ')' which will be
11694                      * paired with the '(' that has been put on the stack, so
11695                      * the whole embedded expression reduces to '(operand)' */
11696                     RExC_parse++;
11697
11698                     RExC_flags = save_flags;
11699                     goto handle_operand;
11700                 }
11701                 /* FALL THROUGH */
11702
11703             default:
11704                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11705                 vFAIL("Unexpected character");
11706
11707             case '\\':
11708                 /* regclass() can only return RESTART_UTF8 if multi-char
11709                    folds are allowed.  */
11710                 if (!regclass(pRExC_state, flagp,depth+1,
11711                               TRUE, /* means parse just the next thing */
11712                               FALSE, /* don't allow multi-char folds */
11713                               FALSE, /* don't silence non-portable warnings.  */
11714                               &current))
11715                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11716                           *flagp);
11717                 /* regclass() will return with parsing just the \ sequence,
11718                  * leaving the parse pointer at the next thing to parse */
11719                 RExC_parse--;
11720                 goto handle_operand;
11721
11722             case '[':   /* Is a bracketed character class */
11723             {
11724                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11725
11726                 if (! is_posix_class) {
11727                     RExC_parse++;
11728                 }
11729
11730                 /* regclass() can only return RESTART_UTF8 if multi-char
11731                    folds are allowed.  */
11732                 if(!regclass(pRExC_state, flagp,depth+1,
11733                              is_posix_class, /* parse the whole char class
11734                                                 only if not a posix class */
11735                              FALSE, /* don't allow multi-char folds */
11736                              FALSE, /* don't silence non-portable warnings.  */
11737                              &current))
11738                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11739                           *flagp);
11740                 /* function call leaves parse pointing to the ']', except if we
11741                  * faked it */
11742                 if (is_posix_class) {
11743                     RExC_parse--;
11744                 }
11745
11746                 goto handle_operand;
11747             }
11748
11749             case '&':
11750             case '|':
11751             case '+':
11752             case '-':
11753             case '^':
11754                 if (top_index < 0
11755                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11756                     || ! IS_OPERAND(*top_ptr))
11757                 {
11758                     RExC_parse++;
11759                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11760                 }
11761                 av_push(stack, newSVuv(curchar));
11762                 break;
11763
11764             case '!':
11765                 av_push(stack, newSVuv(curchar));
11766                 break;
11767
11768             case '(':
11769                 if (top_index >= 0) {
11770                     top_ptr = av_fetch(stack, top_index, FALSE);
11771                     assert(top_ptr);
11772                     if (IS_OPERAND(*top_ptr)) {
11773                         RExC_parse++;
11774                         vFAIL("Unexpected '(' with no preceding operator");
11775                     }
11776                 }
11777                 av_push(stack, newSVuv(curchar));
11778                 break;
11779
11780             case ')':
11781             {
11782                 SV* lparen;
11783                 if (top_index < 1
11784                     || ! (current = av_pop(stack))
11785                     || ! IS_OPERAND(current)
11786                     || ! (lparen = av_pop(stack))
11787                     || IS_OPERAND(lparen)
11788                     || SvUV(lparen) != '(')
11789                 {
11790                     RExC_parse++;
11791                     vFAIL("Unexpected ')'");
11792                 }
11793                 top_index -= 2;
11794                 SvREFCNT_dec_NN(lparen);
11795
11796                 /* FALL THROUGH */
11797             }
11798
11799               handle_operand:
11800
11801                 /* Here, we have an operand to process, in 'current' */
11802
11803                 if (top_index < 0) {    /* Just push if stack is empty */
11804                     av_push(stack, current);
11805                 }
11806                 else {
11807                     SV* top = av_pop(stack);
11808                     char current_operator;
11809
11810                     if (IS_OPERAND(top)) {
11811                         vFAIL("Operand with no preceding operator");
11812                     }
11813                     current_operator = (char) SvUV(top);
11814                     switch (current_operator) {
11815                         case '(':   /* Push the '(' back on followed by the new
11816                                        operand */
11817                             av_push(stack, top);
11818                             av_push(stack, current);
11819                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11820                                                    just after the 'break', so
11821                                                    it doesn't get wrongly freed
11822                                                  */
11823                             break;
11824
11825                         case '!':
11826                             _invlist_invert(current);
11827
11828                             /* Unlike binary operators, the top of the stack,
11829                              * now that this unary one has been popped off, may
11830                              * legally be an operator, and we now have operand
11831                              * for it. */
11832                             top_index--;
11833                             SvREFCNT_dec_NN(top);
11834                             goto handle_operand;
11835
11836                         case '&':
11837                             _invlist_intersection(av_pop(stack),
11838                                                    current,
11839                                                    &current);
11840                             av_push(stack, current);
11841                             break;
11842
11843                         case '|':
11844                         case '+':
11845                             _invlist_union(av_pop(stack), current, &current);
11846                             av_push(stack, current);
11847                             break;
11848
11849                         case '-':
11850                             _invlist_subtract(av_pop(stack), current, &current);
11851                             av_push(stack, current);
11852                             break;
11853
11854                         case '^':   /* The union minus the intersection */
11855                         {
11856                             SV* i = NULL;
11857                             SV* u = NULL;
11858                             SV* element;
11859
11860                             element = av_pop(stack);
11861                             _invlist_union(element, current, &u);
11862                             _invlist_intersection(element, current, &i);
11863                             _invlist_subtract(u, i, &current);
11864                             av_push(stack, current);
11865                             SvREFCNT_dec_NN(i);
11866                             SvREFCNT_dec_NN(u);
11867                             SvREFCNT_dec_NN(element);
11868                             break;
11869                         }
11870
11871                         default:
11872                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11873                 }
11874                 SvREFCNT_dec_NN(top);
11875             }
11876         }
11877
11878         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11879     }
11880
11881     if (av_tindex(stack) < 0   /* Was empty */
11882         || ((final = av_pop(stack)) == NULL)
11883         || ! IS_OPERAND(final)
11884         || av_tindex(stack) >= 0)  /* More left on stack */
11885     {
11886         vFAIL("Incomplete expression within '(?[ ])'");
11887     }
11888
11889     /* Here, 'final' is the resultant inversion list from evaluating the
11890      * expression.  Return it if so requested */
11891     if (return_invlist) {
11892         *return_invlist = final;
11893         return END;
11894     }
11895
11896     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
11897      * expecting a string of ranges and individual code points */
11898     invlist_iterinit(final);
11899     result_string = newSVpvs("");
11900     while (invlist_iternext(final, &start, &end)) {
11901         if (start == end) {
11902             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11903         }
11904         else {
11905             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11906                                                      start,          end);
11907         }
11908     }
11909
11910     save_parse = RExC_parse;
11911     RExC_parse = SvPV(result_string, len);
11912     save_end = RExC_end;
11913     RExC_end = RExC_parse + len;
11914
11915     /* We turn off folding around the call, as the class we have constructed
11916      * already has all folding taken into consideration, and we don't want
11917      * regclass() to add to that */
11918     RExC_flags &= ~RXf_PMf_FOLD;
11919     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
11920      */
11921     node = regclass(pRExC_state, flagp,depth+1,
11922                     FALSE, /* means parse the whole char class */
11923                     FALSE, /* don't allow multi-char folds */
11924                     TRUE, /* silence non-portable warnings.  The above may very
11925                              well have generated non-portable code points, but
11926                              they're valid on this machine */
11927                     NULL);
11928     if (!node)
11929         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
11930                     PTR2UV(flagp));
11931     if (save_fold) {
11932         RExC_flags |= RXf_PMf_FOLD;
11933     }
11934     RExC_parse = save_parse + 1;
11935     RExC_end = save_end;
11936     SvREFCNT_dec_NN(final);
11937     SvREFCNT_dec_NN(result_string);
11938     SvREFCNT_dec_NN(stack);
11939
11940     nextchar(pRExC_state);
11941     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11942     return node;
11943 }
11944 #undef IS_OPERAND
11945
11946 /* The names of properties whose definitions are not known at compile time are
11947  * stored in this SV, after a constant heading.  So if the length has been
11948  * changed since initialization, then there is a run-time definition. */
11949 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11950
11951 STATIC regnode *
11952 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11953                  const bool stop_at_1,  /* Just parse the next thing, don't
11954                                            look for a full character class */
11955                  bool allow_multi_folds,
11956                  const bool silence_non_portable,   /* Don't output warnings
11957                                                        about too large
11958                                                        characters */
11959                  SV** ret_invlist)  /* Return an inversion list, not a node */
11960 {
11961     /* parse a bracketed class specification.  Most of these will produce an
11962      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11963      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
11964      * under /i with multi-character folds: it will be rewritten following the
11965      * paradigm of this example, where the <multi-fold>s are characters which
11966      * fold to multiple character sequences:
11967      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11968      * gets effectively rewritten as:
11969      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11970      * reg() gets called (recursively) on the rewritten version, and this
11971      * function will return what it constructs.  (Actually the <multi-fold>s
11972      * aren't physically removed from the [abcdefghi], it's just that they are
11973      * ignored in the recursion by means of a flag:
11974      * <RExC_in_multi_char_class>.)
11975      *
11976      * ANYOF nodes contain a bit map for the first 256 characters, with the
11977      * corresponding bit set if that character is in the list.  For characters
11978      * above 255, a range list or swash is used.  There are extra bits for \w,
11979      * etc. in locale ANYOFs, as what these match is not determinable at
11980      * compile time
11981      *
11982      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
11983      * to be restarted.  This can only happen if ret_invlist is non-NULL.
11984      */
11985
11986     dVAR;
11987     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11988     IV range = 0;
11989     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11990     regnode *ret;
11991     STRLEN numlen;
11992     IV namedclass = OOB_NAMEDCLASS;
11993     char *rangebegin = NULL;
11994     bool need_class = 0;
11995     SV *listsv = NULL;
11996     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11997                                       than just initialized.  */
11998     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11999     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12000                                extended beyond the Latin1 range */
12001     UV element_count = 0;   /* Number of distinct elements in the class.
12002                                Optimizations may be possible if this is tiny */
12003     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12004                                        character; used under /i */
12005     UV n;
12006     char * stop_ptr = RExC_end;    /* where to stop parsing */
12007     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12008                                                    space? */
12009     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12010
12011     /* Unicode properties are stored in a swash; this holds the current one
12012      * being parsed.  If this swash is the only above-latin1 component of the
12013      * character class, an optimization is to pass it directly on to the
12014      * execution engine.  Otherwise, it is set to NULL to indicate that there
12015      * are other things in the class that have to be dealt with at execution
12016      * time */
12017     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12018
12019     /* Set if a component of this character class is user-defined; just passed
12020      * on to the engine */
12021     bool has_user_defined_property = FALSE;
12022
12023     /* inversion list of code points this node matches only when the target
12024      * string is in UTF-8.  (Because is under /d) */
12025     SV* depends_list = NULL;
12026
12027     /* inversion list of code points this node matches.  For much of the
12028      * function, it includes only those that match regardless of the utf8ness
12029      * of the target string */
12030     SV* cp_list = NULL;
12031
12032 #ifdef EBCDIC
12033     /* In a range, counts how many 0-2 of the ends of it came from literals,
12034      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12035     UV literal_endpoint = 0;
12036 #endif
12037     bool invert = FALSE;    /* Is this class to be complemented */
12038
12039     /* Is there any thing like \W or [:^digit:] that matches above the legal
12040      * Unicode range? */
12041     bool runtime_posix_matches_above_Unicode = FALSE;
12042
12043     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12044         case we need to change the emitted regop to an EXACT. */
12045     const char * orig_parse = RExC_parse;
12046     const I32 orig_size = RExC_size;
12047     GET_RE_DEBUG_FLAGS_DECL;
12048
12049     PERL_ARGS_ASSERT_REGCLASS;
12050 #ifndef DEBUGGING
12051     PERL_UNUSED_ARG(depth);
12052 #endif
12053
12054     DEBUG_PARSE("clas");
12055
12056     /* Assume we are going to generate an ANYOF node. */
12057     ret = reganode(pRExC_state, ANYOF, 0);
12058
12059     if (SIZE_ONLY) {
12060         RExC_size += ANYOF_SKIP;
12061         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12062     }
12063     else {
12064         ANYOF_FLAGS(ret) = 0;
12065
12066         RExC_emit += ANYOF_SKIP;
12067         if (LOC) {
12068             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12069         }
12070         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12071         initial_listsv_len = SvCUR(listsv);
12072         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12073     }
12074
12075     if (skip_white) {
12076         RExC_parse = regpatws(pRExC_state, RExC_parse,
12077                               FALSE /* means don't recognize comments */);
12078     }
12079
12080     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12081         RExC_parse++;
12082         invert = TRUE;
12083         allow_multi_folds = FALSE;
12084         RExC_naughty++;
12085         if (skip_white) {
12086             RExC_parse = regpatws(pRExC_state, RExC_parse,
12087                                   FALSE /* means don't recognize comments */);
12088         }
12089     }
12090
12091     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12092     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12093         const char *s = RExC_parse;
12094         const char  c = *s++;
12095
12096         while (isWORDCHAR(*s))
12097             s++;
12098         if (*s && c == *s && s[1] == ']') {
12099             SAVEFREESV(RExC_rx_sv);
12100             ckWARN3reg(s+2,
12101                        "POSIX syntax [%c %c] belongs inside character classes",
12102                        c, c);
12103             (void)ReREFCNT_inc(RExC_rx_sv);
12104         }
12105     }
12106
12107     /* If the caller wants us to just parse a single element, accomplish this
12108      * by faking the loop ending condition */
12109     if (stop_at_1 && RExC_end > RExC_parse) {
12110         stop_ptr = RExC_parse + 1;
12111     }
12112
12113     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12114     if (UCHARAT(RExC_parse) == ']')
12115         goto charclassloop;
12116
12117 parseit:
12118     while (1) {
12119         if  (RExC_parse >= stop_ptr) {
12120             break;
12121         }
12122
12123         if (skip_white) {
12124             RExC_parse = regpatws(pRExC_state, RExC_parse,
12125                                   FALSE /* means don't recognize comments */);
12126         }
12127
12128         if  (UCHARAT(RExC_parse) == ']') {
12129             break;
12130         }
12131
12132     charclassloop:
12133
12134         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12135         save_value = value;
12136         save_prevvalue = prevvalue;
12137
12138         if (!range) {
12139             rangebegin = RExC_parse;
12140             element_count++;
12141         }
12142         if (UTF) {
12143             value = utf8n_to_uvchr((U8*)RExC_parse,
12144                                    RExC_end - RExC_parse,
12145                                    &numlen, UTF8_ALLOW_DEFAULT);
12146             RExC_parse += numlen;
12147         }
12148         else
12149             value = UCHARAT(RExC_parse++);
12150
12151         if (value == '['
12152             && RExC_parse < RExC_end
12153             && POSIXCC(UCHARAT(RExC_parse)))
12154         {
12155             namedclass = regpposixcc(pRExC_state, value, strict);
12156         }
12157         else if (value == '\\') {
12158             if (UTF) {
12159                 value = utf8n_to_uvchr((U8*)RExC_parse,
12160                                    RExC_end - RExC_parse,
12161                                    &numlen, UTF8_ALLOW_DEFAULT);
12162                 RExC_parse += numlen;
12163             }
12164             else
12165                 value = UCHARAT(RExC_parse++);
12166
12167             /* Some compilers cannot handle switching on 64-bit integer
12168              * values, therefore value cannot be an UV.  Yes, this will
12169              * be a problem later if we want switch on Unicode.
12170              * A similar issue a little bit later when switching on
12171              * namedclass. --jhi */
12172
12173             /* If the \ is escaping white space when white space is being
12174              * skipped, it means that that white space is wanted literally, and
12175              * is already in 'value'.  Otherwise, need to translate the escape
12176              * into what it signifies. */
12177             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12178
12179             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12180             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12181             case 's':   namedclass = ANYOF_SPACE;       break;
12182             case 'S':   namedclass = ANYOF_NSPACE;      break;
12183             case 'd':   namedclass = ANYOF_DIGIT;       break;
12184             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12185             case 'v':   namedclass = ANYOF_VERTWS;      break;
12186             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12187             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12188             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12189             case 'N':  /* Handle \N{NAME} in class */
12190                 {
12191                     /* We only pay attention to the first char of
12192                     multichar strings being returned. I kinda wonder
12193                     if this makes sense as it does change the behaviour
12194                     from earlier versions, OTOH that behaviour was broken
12195                     as well. */
12196                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12197                                       TRUE, /* => charclass */
12198                                       strict))
12199                     {
12200                         if (*flagp & RESTART_UTF8)
12201                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
12202                         goto parseit;
12203                     }
12204                 }
12205                 break;
12206             case 'p':
12207             case 'P':
12208                 {
12209                 char *e;
12210
12211                 /* We will handle any undefined properties ourselves */
12212                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12213
12214                 if (RExC_parse >= RExC_end)
12215                     vFAIL2("Empty \\%c{}", (U8)value);
12216                 if (*RExC_parse == '{') {
12217                     const U8 c = (U8)value;
12218                     e = strchr(RExC_parse++, '}');
12219                     if (!e)
12220                         vFAIL2("Missing right brace on \\%c{}", c);
12221                     while (isSPACE(UCHARAT(RExC_parse)))
12222                         RExC_parse++;
12223                     if (e == RExC_parse)
12224                         vFAIL2("Empty \\%c{}", c);
12225                     n = e - RExC_parse;
12226                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12227                         n--;
12228                 }
12229                 else {
12230                     e = RExC_parse;
12231                     n = 1;
12232                 }
12233                 if (!SIZE_ONLY) {
12234                     SV* invlist;
12235                     char* name;
12236
12237                     if (UCHARAT(RExC_parse) == '^') {
12238                          RExC_parse++;
12239                          n--;
12240                          /* toggle.  (The rhs xor gets the single bit that
12241                           * differs between P and p; the other xor inverts just
12242                           * that bit) */
12243                          value ^= 'P' ^ 'p';
12244
12245                          while (isSPACE(UCHARAT(RExC_parse))) {
12246                               RExC_parse++;
12247                               n--;
12248                          }
12249                     }
12250                     /* Try to get the definition of the property into
12251                      * <invlist>.  If /i is in effect, the effective property
12252                      * will have its name be <__NAME_i>.  The design is
12253                      * discussed in commit
12254                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12255                     Newx(name, n + sizeof("_i__\n"), char);
12256
12257                     sprintf(name, "%s%.*s%s\n",
12258                                     (FOLD) ? "__" : "",
12259                                     (int)n,
12260                                     RExC_parse,
12261                                     (FOLD) ? "_i" : ""
12262                     );
12263
12264                     /* Look up the property name, and get its swash and
12265                      * inversion list, if the property is found  */
12266                     if (swash) {
12267                         SvREFCNT_dec_NN(swash);
12268                     }
12269                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12270                                              1, /* binary */
12271                                              0, /* not tr/// */
12272                                              NULL, /* No inversion list */
12273                                              &swash_init_flags
12274                                             );
12275                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12276                         if (swash) {
12277                             SvREFCNT_dec_NN(swash);
12278                             swash = NULL;
12279                         }
12280
12281                         /* Here didn't find it.  It could be a user-defined
12282                          * property that will be available at run-time.  If we
12283                          * accept only compile-time properties, is an error;
12284                          * otherwise add it to the list for run-time look up */
12285                         if (ret_invlist) {
12286                             RExC_parse = e + 1;
12287                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12288                         }
12289                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12290                                         (value == 'p' ? '+' : '!'),
12291                                         name);
12292                         has_user_defined_property = TRUE;
12293
12294                         /* We don't know yet, so have to assume that the
12295                          * property could match something in the Latin1 range,
12296                          * hence something that isn't utf8.  Note that this
12297                          * would cause things in <depends_list> to match
12298                          * inappropriately, except that any \p{}, including
12299                          * this one forces Unicode semantics, which means there
12300                          * is <no depends_list> */
12301                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12302                     }
12303                     else {
12304
12305                         /* Here, did get the swash and its inversion list.  If
12306                          * the swash is from a user-defined property, then this
12307                          * whole character class should be regarded as such */
12308                         has_user_defined_property =
12309                                     (swash_init_flags
12310                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12311
12312                         /* Invert if asking for the complement */
12313                         if (value == 'P') {
12314                             _invlist_union_complement_2nd(properties,
12315                                                           invlist,
12316                                                           &properties);
12317
12318                             /* The swash can't be used as-is, because we've
12319                              * inverted things; delay removing it to here after
12320                              * have copied its invlist above */
12321                             SvREFCNT_dec_NN(swash);
12322                             swash = NULL;
12323                         }
12324                         else {
12325                             _invlist_union(properties, invlist, &properties);
12326                         }
12327                     }
12328                     Safefree(name);
12329                 }
12330                 RExC_parse = e + 1;
12331                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12332                                                 named */
12333
12334                 /* \p means they want Unicode semantics */
12335                 RExC_uni_semantics = 1;
12336                 }
12337                 break;
12338             case 'n':   value = '\n';                   break;
12339             case 'r':   value = '\r';                   break;
12340             case 't':   value = '\t';                   break;
12341             case 'f':   value = '\f';                   break;
12342             case 'b':   value = '\b';                   break;
12343             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12344             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12345             case 'o':
12346                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12347                 {
12348                     const char* error_msg;
12349                     bool valid = grok_bslash_o(&RExC_parse,
12350                                                &value,
12351                                                &error_msg,
12352                                                SIZE_ONLY,   /* warnings in pass
12353                                                                1 only */
12354                                                strict,
12355                                                silence_non_portable,
12356                                                UTF);
12357                     if (! valid) {
12358                         vFAIL(error_msg);
12359                     }
12360                 }
12361                 if (PL_encoding && value < 0x100) {
12362                     goto recode_encoding;
12363                 }
12364                 break;
12365             case 'x':
12366                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12367                 {
12368                     const char* error_msg;
12369                     bool valid = grok_bslash_x(&RExC_parse,
12370                                                &value,
12371                                                &error_msg,
12372                                                TRUE, /* Output warnings */
12373                                                strict,
12374                                                silence_non_portable,
12375                                                UTF);
12376                     if (! valid) {
12377                         vFAIL(error_msg);
12378                     }
12379                 }
12380                 if (PL_encoding && value < 0x100)
12381                     goto recode_encoding;
12382                 break;
12383             case 'c':
12384                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12385                 break;
12386             case '0': case '1': case '2': case '3': case '4':
12387             case '5': case '6': case '7':
12388                 {
12389                     /* Take 1-3 octal digits */
12390                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12391                     numlen = (strict) ? 4 : 3;
12392                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12393                     RExC_parse += numlen;
12394                     if (numlen != 3) {
12395                         if (strict) {
12396                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12397                             vFAIL("Need exactly 3 octal digits");
12398                         }
12399                         else if (! SIZE_ONLY /* like \08, \178 */
12400                                  && numlen < 3
12401                                  && RExC_parse < RExC_end
12402                                  && isDIGIT(*RExC_parse)
12403                                  && ckWARN(WARN_REGEXP))
12404                         {
12405                             SAVEFREESV(RExC_rx_sv);
12406                             reg_warn_non_literal_string(
12407                                  RExC_parse + 1,
12408                                  form_short_octal_warning(RExC_parse, numlen));
12409                             (void)ReREFCNT_inc(RExC_rx_sv);
12410                         }
12411                     }
12412                     if (PL_encoding && value < 0x100)
12413                         goto recode_encoding;
12414                     break;
12415                 }
12416             recode_encoding:
12417                 if (! RExC_override_recoding) {
12418                     SV* enc = PL_encoding;
12419                     value = reg_recode((const char)(U8)value, &enc);
12420                     if (!enc) {
12421                         if (strict) {
12422                             vFAIL("Invalid escape in the specified encoding");
12423                         }
12424                         else if (SIZE_ONLY) {
12425                             ckWARNreg(RExC_parse,
12426                                   "Invalid escape in the specified encoding");
12427                         }
12428                     }
12429                     break;
12430                 }
12431             default:
12432                 /* Allow \_ to not give an error */
12433                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12434                     if (strict) {
12435                         vFAIL2("Unrecognized escape \\%c in character class",
12436                                (int)value);
12437                     }
12438                     else {
12439                         SAVEFREESV(RExC_rx_sv);
12440                         ckWARN2reg(RExC_parse,
12441                             "Unrecognized escape \\%c in character class passed through",
12442                             (int)value);
12443                         (void)ReREFCNT_inc(RExC_rx_sv);
12444                     }
12445                 }
12446                 break;
12447             }   /* End of switch on char following backslash */
12448         } /* end of handling backslash escape sequences */
12449 #ifdef EBCDIC
12450         else
12451             literal_endpoint++;
12452 #endif
12453
12454         /* Here, we have the current token in 'value' */
12455
12456         /* What matches in a locale is not known until runtime.  This includes
12457          * what the Posix classes (like \w, [:space:]) match.  Room must be
12458          * reserved (one time per class) to store such classes, either if Perl
12459          * is compiled so that locale nodes always should have this space, or
12460          * if there is such class info to be stored.  The space will contain a
12461          * bit for each named class that is to be matched against.  This isn't
12462          * needed for \p{} and pseudo-classes, as they are not affected by
12463          * locale, and hence are dealt with separately */
12464         if (LOC
12465             && ! need_class
12466             && (ANYOF_LOCALE == ANYOF_CLASS
12467                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12468         {
12469             need_class = 1;
12470             if (SIZE_ONLY) {
12471                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12472             }
12473             else {
12474                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12475                 ANYOF_CLASS_ZERO(ret);
12476             }
12477             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12478         }
12479
12480         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12481
12482             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12483              * literal, as is the character that began the false range, i.e.
12484              * the 'a' in the examples */
12485             if (range) {
12486                 if (!SIZE_ONLY) {
12487                     const int w = (RExC_parse >= rangebegin)
12488                                   ? RExC_parse - rangebegin
12489                                   : 0;
12490                     if (strict) {
12491                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12492                     }
12493                     else {
12494                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12495                         ckWARN4reg(RExC_parse,
12496                                 "False [] range \"%*.*s\"",
12497                                 w, w, rangebegin);
12498                         (void)ReREFCNT_inc(RExC_rx_sv);
12499                         cp_list = add_cp_to_invlist(cp_list, '-');
12500                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12501                     }
12502                 }
12503
12504                 range = 0; /* this was not a true range */
12505                 element_count += 2; /* So counts for three values */
12506             }
12507
12508             if (! SIZE_ONLY) {
12509                 U8 classnum = namedclass_to_classnum(namedclass);
12510                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12511                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12512
12513                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12514                          * /l make a difference in what these match.  There
12515                          * would be problems if these characters had folds
12516                          * other than themselves, as cp_list is subject to
12517                          * folding. */
12518                         if (classnum != _CC_VERTSPACE) {
12519                             assert(   namedclass == ANYOF_HORIZWS
12520                                    || namedclass == ANYOF_NHORIZWS);
12521
12522                             /* It turns out that \h is just a synonym for
12523                              * XPosixBlank */
12524                             classnum = _CC_BLANK;
12525                         }
12526
12527                         _invlist_union_maybe_complement_2nd(
12528                                 cp_list,
12529                                 PL_XPosix_ptrs[classnum],
12530                                 cBOOL(namedclass % 2), /* Complement if odd
12531                                                           (NHORIZWS, NVERTWS)
12532                                                         */
12533                                 &cp_list);
12534                     }
12535                 }
12536                 else if (classnum == _CC_ASCII) {
12537 #ifdef HAS_ISASCII
12538                     if (LOC) {
12539                         ANYOF_CLASS_SET(ret, namedclass);
12540                     }
12541                     else
12542 #endif  /* Not isascii(); just use the hard-coded definition for it */
12543                         _invlist_union_maybe_complement_2nd(
12544                                 posixes,
12545                                 PL_ASCII,
12546                                 cBOOL(namedclass % 2), /* Complement if odd
12547                                                           (NASCII) */
12548                                 &posixes);
12549                 }
12550                 else {  /* Garden variety class */
12551
12552                     /* The ascii range inversion list */
12553                     SV* ascii_source = PL_Posix_ptrs[classnum];
12554
12555                     /* The full Latin1 range inversion list */
12556                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12557
12558                     /* This code is structured into two major clauses.  The
12559                      * first is for classes whose complete definitions may not
12560                      * already be known.  It not, the Latin1 definition
12561                      * (guaranteed to already known) is used plus code is
12562                      * generated to load the rest at run-time (only if needed).
12563                      * If the complete definition is known, it drops down to
12564                      * the second clause, where the complete definition is
12565                      * known */
12566
12567                     if (classnum < _FIRST_NON_SWASH_CC) {
12568
12569                         /* Here, the class has a swash, which may or not
12570                          * already be loaded */
12571
12572                         /* The name of the property to use to match the full
12573                          * eXtended Unicode range swash for this character
12574                          * class */
12575                         const char *Xname = swash_property_names[classnum];
12576
12577                         /* If returning the inversion list, we can't defer
12578                          * getting this until runtime */
12579                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12580                             PL_utf8_swash_ptrs[classnum] =
12581                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12582                                              1, /* binary */
12583                                              0, /* not tr/// */
12584                                              NULL, /* No inversion list */
12585                                              NULL  /* No flags */
12586                                             );
12587                             assert(PL_utf8_swash_ptrs[classnum]);
12588                         }
12589                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12590                             if (namedclass % 2 == 0) { /* A non-complemented
12591                                                           class */
12592                                 /* If not /a matching, there are code points we
12593                                  * don't know at compile time.  Arrange for the
12594                                  * unknown matches to be loaded at run-time, if
12595                                  * needed */
12596                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12597                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12598                                                                  Xname);
12599                                 }
12600                                 if (LOC) {  /* Under locale, set run-time
12601                                                lookup */
12602                                     ANYOF_CLASS_SET(ret, namedclass);
12603                                 }
12604                                 else {
12605                                     /* Add the current class's code points to
12606                                      * the running total */
12607                                     _invlist_union(posixes,
12608                                                    (AT_LEAST_ASCII_RESTRICTED)
12609                                                         ? ascii_source
12610                                                         : l1_source,
12611                                                    &posixes);
12612                                 }
12613                             }
12614                             else {  /* A complemented class */
12615                                 if (AT_LEAST_ASCII_RESTRICTED) {
12616                                     /* Under /a should match everything above
12617                                      * ASCII, plus the complement of the set's
12618                                      * ASCII matches */
12619                                     _invlist_union_complement_2nd(posixes,
12620                                                                   ascii_source,
12621                                                                   &posixes);
12622                                 }
12623                                 else {
12624                                     /* Arrange for the unknown matches to be
12625                                      * loaded at run-time, if needed */
12626                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12627                                                                  Xname);
12628                                     runtime_posix_matches_above_Unicode = TRUE;
12629                                     if (LOC) {
12630                                         ANYOF_CLASS_SET(ret, namedclass);
12631                                     }
12632                                     else {
12633
12634                                         /* We want to match everything in
12635                                          * Latin1, except those things that
12636                                          * l1_source matches */
12637                                         SV* scratch_list = NULL;
12638                                         _invlist_subtract(PL_Latin1, l1_source,
12639                                                           &scratch_list);
12640
12641                                         /* Add the list from this class to the
12642                                          * running total */
12643                                         if (! posixes) {
12644                                             posixes = scratch_list;
12645                                         }
12646                                         else {
12647                                             _invlist_union(posixes,
12648                                                            scratch_list,
12649                                                            &posixes);
12650                                             SvREFCNT_dec_NN(scratch_list);
12651                                         }
12652                                         if (DEPENDS_SEMANTICS) {
12653                                             ANYOF_FLAGS(ret)
12654                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12655                                         }
12656                                     }
12657                                 }
12658                             }
12659                             goto namedclass_done;
12660                         }
12661
12662                         /* Here, there is a swash loaded for the class.  If no
12663                          * inversion list for it yet, get it */
12664                         if (! PL_XPosix_ptrs[classnum]) {
12665                             PL_XPosix_ptrs[classnum]
12666                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12667                         }
12668                     }
12669
12670                     /* Here there is an inversion list already loaded for the
12671                      * entire class */
12672
12673                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12674                                                    like ANYOF_PUNCT */
12675                         if (! LOC) {
12676                             /* For non-locale, just add it to any existing list
12677                              * */
12678                             _invlist_union(posixes,
12679                                            (AT_LEAST_ASCII_RESTRICTED)
12680                                                ? ascii_source
12681                                                : PL_XPosix_ptrs[classnum],
12682                                            &posixes);
12683                         }
12684                         else {  /* Locale */
12685                             SV* scratch_list = NULL;
12686
12687                             /* For above Latin1 code points, we use the full
12688                              * Unicode range */
12689                             _invlist_intersection(PL_AboveLatin1,
12690                                                   PL_XPosix_ptrs[classnum],
12691                                                   &scratch_list);
12692                             /* And set the output to it, adding instead if
12693                              * there already is an output.  Checking if
12694                              * 'posixes' is NULL first saves an extra clone.
12695                              * Its reference count will be decremented at the
12696                              * next union, etc, or if this is the only
12697                              * instance, at the end of the routine */
12698                             if (! posixes) {
12699                                 posixes = scratch_list;
12700                             }
12701                             else {
12702                                 _invlist_union(posixes, scratch_list, &posixes);
12703                                 SvREFCNT_dec_NN(scratch_list);
12704                             }
12705
12706 #ifndef HAS_ISBLANK
12707                             if (namedclass != ANYOF_BLANK) {
12708 #endif
12709                                 /* Set this class in the node for runtime
12710                                  * matching */
12711                                 ANYOF_CLASS_SET(ret, namedclass);
12712 #ifndef HAS_ISBLANK
12713                             }
12714                             else {
12715                                 /* No isblank(), use the hard-coded ASCII-range
12716                                  * blanks, adding them to the running total. */
12717
12718                                 _invlist_union(posixes, ascii_source, &posixes);
12719                             }
12720 #endif
12721                         }
12722                     }
12723                     else {  /* A complemented class, like ANYOF_NPUNCT */
12724                         if (! LOC) {
12725                             _invlist_union_complement_2nd(
12726                                                 posixes,
12727                                                 (AT_LEAST_ASCII_RESTRICTED)
12728                                                     ? ascii_source
12729                                                     : PL_XPosix_ptrs[classnum],
12730                                                 &posixes);
12731                             /* Under /d, everything in the upper half of the
12732                              * Latin1 range matches this complement */
12733                             if (DEPENDS_SEMANTICS) {
12734                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12735                             }
12736                         }
12737                         else {  /* Locale */
12738                             SV* scratch_list = NULL;
12739                             _invlist_subtract(PL_AboveLatin1,
12740                                               PL_XPosix_ptrs[classnum],
12741                                               &scratch_list);
12742                             if (! posixes) {
12743                                 posixes = scratch_list;
12744                             }
12745                             else {
12746                                 _invlist_union(posixes, scratch_list, &posixes);
12747                                 SvREFCNT_dec_NN(scratch_list);
12748                             }
12749 #ifndef HAS_ISBLANK
12750                             if (namedclass != ANYOF_NBLANK) {
12751 #endif
12752                                 ANYOF_CLASS_SET(ret, namedclass);
12753 #ifndef HAS_ISBLANK
12754                             }
12755                             else {
12756                                 /* Get the list of all code points in Latin1
12757                                  * that are not ASCII blanks, and add them to
12758                                  * the running total */
12759                                 _invlist_subtract(PL_Latin1, ascii_source,
12760                                                   &scratch_list);
12761                                 _invlist_union(posixes, scratch_list, &posixes);
12762                                 SvREFCNT_dec_NN(scratch_list);
12763                             }
12764 #endif
12765                         }
12766                     }
12767                 }
12768               namedclass_done:
12769                 continue;   /* Go get next character */
12770             }
12771         } /* end of namedclass \blah */
12772
12773         /* Here, we have a single value.  If 'range' is set, it is the ending
12774          * of a range--check its validity.  Later, we will handle each
12775          * individual code point in the range.  If 'range' isn't set, this
12776          * could be the beginning of a range, so check for that by looking
12777          * ahead to see if the next real character to be processed is the range
12778          * indicator--the minus sign */
12779
12780         if (skip_white) {
12781             RExC_parse = regpatws(pRExC_state, RExC_parse,
12782                                 FALSE /* means don't recognize comments */);
12783         }
12784
12785         if (range) {
12786             if (prevvalue > value) /* b-a */ {
12787                 const int w = RExC_parse - rangebegin;
12788                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12789                 range = 0; /* not a valid range */
12790             }
12791         }
12792         else {
12793             prevvalue = value; /* save the beginning of the potential range */
12794             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12795                 && *RExC_parse == '-')
12796             {
12797                 char* next_char_ptr = RExC_parse + 1;
12798                 if (skip_white) {   /* Get the next real char after the '-' */
12799                     next_char_ptr = regpatws(pRExC_state,
12800                                              RExC_parse + 1,
12801                                              FALSE); /* means don't recognize
12802                                                         comments */
12803                 }
12804
12805                 /* If the '-' is at the end of the class (just before the ']',
12806                  * it is a literal minus; otherwise it is a range */
12807                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12808                     RExC_parse = next_char_ptr;
12809
12810                     /* a bad range like \w-, [:word:]- ? */
12811                     if (namedclass > OOB_NAMEDCLASS) {
12812                         if (strict || ckWARN(WARN_REGEXP)) {
12813                             const int w =
12814                                 RExC_parse >= rangebegin ?
12815                                 RExC_parse - rangebegin : 0;
12816                             if (strict) {
12817                                 vFAIL4("False [] range \"%*.*s\"",
12818                                     w, w, rangebegin);
12819                             }
12820                             else {
12821                                 vWARN4(RExC_parse,
12822                                     "False [] range \"%*.*s\"",
12823                                     w, w, rangebegin);
12824                             }
12825                         }
12826                         if (!SIZE_ONLY) {
12827                             cp_list = add_cp_to_invlist(cp_list, '-');
12828                         }
12829                         element_count++;
12830                     } else
12831                         range = 1;      /* yeah, it's a range! */
12832                     continue;   /* but do it the next time */
12833                 }
12834             }
12835         }
12836
12837         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12838          * if not */
12839
12840         /* non-Latin1 code point implies unicode semantics.  Must be set in
12841          * pass1 so is there for the whole of pass 2 */
12842         if (value > 255) {
12843             RExC_uni_semantics = 1;
12844         }
12845
12846         /* Ready to process either the single value, or the completed range.
12847          * For single-valued non-inverted ranges, we consider the possibility
12848          * of multi-char folds.  (We made a conscious decision to not do this
12849          * for the other cases because it can often lead to non-intuitive
12850          * results.  For example, you have the peculiar case that:
12851          *  "s s" =~ /^[^\xDF]+$/i => Y
12852          *  "ss"  =~ /^[^\xDF]+$/i => N
12853          *
12854          * See [perl #89750] */
12855         if (FOLD && allow_multi_folds && value == prevvalue) {
12856             if (value == LATIN_SMALL_LETTER_SHARP_S
12857                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12858                                                         value)))
12859             {
12860                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12861
12862                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12863                 STRLEN foldlen;
12864
12865                 UV folded = _to_uni_fold_flags(
12866                                 value,
12867                                 foldbuf,
12868                                 &foldlen,
12869                                 FOLD_FLAGS_FULL
12870                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12871                                             : (ASCII_FOLD_RESTRICTED)
12872                                               ? FOLD_FLAGS_NOMIX_ASCII
12873                                               : 0)
12874                                 );
12875
12876                 /* Here, <folded> should be the first character of the
12877                  * multi-char fold of <value>, with <foldbuf> containing the
12878                  * whole thing.  But, if this fold is not allowed (because of
12879                  * the flags), <fold> will be the same as <value>, and should
12880                  * be processed like any other character, so skip the special
12881                  * handling */
12882                 if (folded != value) {
12883
12884                     /* Skip if we are recursed, currently parsing the class
12885                      * again.  Otherwise add this character to the list of
12886                      * multi-char folds. */
12887                     if (! RExC_in_multi_char_class) {
12888                         AV** this_array_ptr;
12889                         AV* this_array;
12890                         STRLEN cp_count = utf8_length(foldbuf,
12891                                                       foldbuf + foldlen);
12892                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12893
12894                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12895
12896
12897                         if (! multi_char_matches) {
12898                             multi_char_matches = newAV();
12899                         }
12900
12901                         /* <multi_char_matches> is actually an array of arrays.
12902                          * There will be one or two top-level elements: [2],
12903                          * and/or [3].  The [2] element is an array, each
12904                          * element thereof is a character which folds to two
12905                          * characters; likewise for [3].  (Unicode guarantees a
12906                          * maximum of 3 characters in any fold.)  When we
12907                          * rewrite the character class below, we will do so
12908                          * such that the longest folds are written first, so
12909                          * that it prefers the longest matching strings first.
12910                          * This is done even if it turns out that any
12911                          * quantifier is non-greedy, out of programmer
12912                          * laziness.  Tom Christiansen has agreed that this is
12913                          * ok.  This makes the test for the ligature 'ffi' come
12914                          * before the test for 'ff' */
12915                         if (av_exists(multi_char_matches, cp_count)) {
12916                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12917                                                              cp_count, FALSE);
12918                             this_array = *this_array_ptr;
12919                         }
12920                         else {
12921                             this_array = newAV();
12922                             av_store(multi_char_matches, cp_count,
12923                                      (SV*) this_array);
12924                         }
12925                         av_push(this_array, multi_fold);
12926                     }
12927
12928                     /* This element should not be processed further in this
12929                      * class */
12930                     element_count--;
12931                     value = save_value;
12932                     prevvalue = save_prevvalue;
12933                     continue;
12934                 }
12935             }
12936         }
12937
12938         /* Deal with this element of the class */
12939         if (! SIZE_ONLY) {
12940 #ifndef EBCDIC
12941             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12942 #else
12943             SV* this_range = _new_invlist(1);
12944             _append_range_to_invlist(this_range, prevvalue, value);
12945
12946             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12947              * If this range was specified using something like 'i-j', we want
12948              * to include only the 'i' and the 'j', and not anything in
12949              * between, so exclude non-ASCII, non-alphabetics from it.
12950              * However, if the range was specified with something like
12951              * [\x89-\x91] or [\x89-j], all code points within it should be
12952              * included.  literal_endpoint==2 means both ends of the range used
12953              * a literal character, not \x{foo} */
12954             if (literal_endpoint == 2
12955                 && (prevvalue >= 'a' && value <= 'z')
12956                     || (prevvalue >= 'A' && value <= 'Z'))
12957             {
12958                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
12959                                       &this_range);
12960             }
12961             _invlist_union(cp_list, this_range, &cp_list);
12962             literal_endpoint = 0;
12963 #endif
12964         }
12965
12966         range = 0; /* this range (if it was one) is done now */
12967     } /* End of loop through all the text within the brackets */
12968
12969     /* If anything in the class expands to more than one character, we have to
12970      * deal with them by building up a substitute parse string, and recursively
12971      * calling reg() on it, instead of proceeding */
12972     if (multi_char_matches) {
12973         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12974         I32 cp_count;
12975         STRLEN len;
12976         char *save_end = RExC_end;
12977         char *save_parse = RExC_parse;
12978         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12979                                        a "|" */
12980         I32 reg_flags;
12981
12982         assert(! invert);
12983 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12984            because too confusing */
12985         if (invert) {
12986             sv_catpv(substitute_parse, "(?:");
12987         }
12988 #endif
12989
12990         /* Look at the longest folds first */
12991         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12992
12993             if (av_exists(multi_char_matches, cp_count)) {
12994                 AV** this_array_ptr;
12995                 SV* this_sequence;
12996
12997                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12998                                                  cp_count, FALSE);
12999                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13000                                                                 &PL_sv_undef)
13001                 {
13002                     if (! first_time) {
13003                         sv_catpv(substitute_parse, "|");
13004                     }
13005                     first_time = FALSE;
13006
13007                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13008                 }
13009             }
13010         }
13011
13012         /* If the character class contains anything else besides these
13013          * multi-character folds, have to include it in recursive parsing */
13014         if (element_count) {
13015             sv_catpv(substitute_parse, "|[");
13016             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13017             sv_catpv(substitute_parse, "]");
13018         }
13019
13020         sv_catpv(substitute_parse, ")");
13021 #if 0
13022         if (invert) {
13023             /* This is a way to get the parse to skip forward a whole named
13024              * sequence instead of matching the 2nd character when it fails the
13025              * first */
13026             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13027         }
13028 #endif
13029
13030         RExC_parse = SvPV(substitute_parse, len);
13031         RExC_end = RExC_parse + len;
13032         RExC_in_multi_char_class = 1;
13033         RExC_emit = (regnode *)orig_emit;
13034
13035         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13036
13037         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13038
13039         RExC_parse = save_parse;
13040         RExC_end = save_end;
13041         RExC_in_multi_char_class = 0;
13042         SvREFCNT_dec_NN(multi_char_matches);
13043         return ret;
13044     }
13045
13046     /* If the character class contains only a single element, it may be
13047      * optimizable into another node type which is smaller and runs faster.
13048      * Check if this is the case for this class */
13049     if (element_count == 1 && ! ret_invlist) {
13050         U8 op = END;
13051         U8 arg = 0;
13052
13053         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13054                                               [:digit:] or \p{foo} */
13055
13056             /* All named classes are mapped into POSIXish nodes, with its FLAG
13057              * argument giving which class it is */
13058             switch ((I32)namedclass) {
13059                 case ANYOF_UNIPROP:
13060                     break;
13061
13062                 /* These don't depend on the charset modifiers.  They always
13063                  * match under /u rules */
13064                 case ANYOF_NHORIZWS:
13065                 case ANYOF_HORIZWS:
13066                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13067                     /* FALLTHROUGH */
13068
13069                 case ANYOF_NVERTWS:
13070                 case ANYOF_VERTWS:
13071                     op = POSIXU;
13072                     goto join_posix;
13073
13074                 /* The actual POSIXish node for all the rest depends on the
13075                  * charset modifier.  The ones in the first set depend only on
13076                  * ASCII or, if available on this platform, locale */
13077                 case ANYOF_ASCII:
13078                 case ANYOF_NASCII:
13079 #ifdef HAS_ISASCII
13080                     op = (LOC) ? POSIXL : POSIXA;
13081 #else
13082                     op = POSIXA;
13083 #endif
13084                     goto join_posix;
13085
13086                 case ANYOF_NCASED:
13087                 case ANYOF_LOWER:
13088                 case ANYOF_NLOWER:
13089                 case ANYOF_UPPER:
13090                 case ANYOF_NUPPER:
13091                     /* under /a could be alpha */
13092                     if (FOLD) {
13093                         if (ASCII_RESTRICTED) {
13094                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13095                         }
13096                         else if (! LOC) {
13097                             break;
13098                         }
13099                     }
13100                     /* FALLTHROUGH */
13101
13102                 /* The rest have more possibilities depending on the charset.
13103                  * We take advantage of the enum ordering of the charset
13104                  * modifiers to get the exact node type, */
13105                 default:
13106                     op = POSIXD + get_regex_charset(RExC_flags);
13107                     if (op > POSIXA) { /* /aa is same as /a */
13108                         op = POSIXA;
13109                     }
13110 #ifndef HAS_ISBLANK
13111                     if (op == POSIXL
13112                         && (namedclass == ANYOF_BLANK
13113                             || namedclass == ANYOF_NBLANK))
13114                     {
13115                         op = POSIXA;
13116                     }
13117 #endif
13118
13119                 join_posix:
13120                     /* The odd numbered ones are the complements of the
13121                      * next-lower even number one */
13122                     if (namedclass % 2 == 1) {
13123                         invert = ! invert;
13124                         namedclass--;
13125                     }
13126                     arg = namedclass_to_classnum(namedclass);
13127                     break;
13128             }
13129         }
13130         else if (value == prevvalue) {
13131
13132             /* Here, the class consists of just a single code point */
13133
13134             if (invert) {
13135                 if (! LOC && value == '\n') {
13136                     op = REG_ANY; /* Optimize [^\n] */
13137                     *flagp |= HASWIDTH|SIMPLE;
13138                     RExC_naughty++;
13139                 }
13140             }
13141             else if (value < 256 || UTF) {
13142
13143                 /* Optimize a single value into an EXACTish node, but not if it
13144                  * would require converting the pattern to UTF-8. */
13145                 op = compute_EXACTish(pRExC_state);
13146             }
13147         } /* Otherwise is a range */
13148         else if (! LOC) {   /* locale could vary these */
13149             if (prevvalue == '0') {
13150                 if (value == '9') {
13151                     arg = _CC_DIGIT;
13152                     op = POSIXA;
13153                 }
13154             }
13155         }
13156
13157         /* Here, we have changed <op> away from its initial value iff we found
13158          * an optimization */
13159         if (op != END) {
13160
13161             /* Throw away this ANYOF regnode, and emit the calculated one,
13162              * which should correspond to the beginning, not current, state of
13163              * the parse */
13164             const char * cur_parse = RExC_parse;
13165             RExC_parse = (char *)orig_parse;
13166             if ( SIZE_ONLY) {
13167                 if (! LOC) {
13168
13169                     /* To get locale nodes to not use the full ANYOF size would
13170                      * require moving the code above that writes the portions
13171                      * of it that aren't in other nodes to after this point.
13172                      * e.g.  ANYOF_CLASS_SET */
13173                     RExC_size = orig_size;
13174                 }
13175             }
13176             else {
13177                 RExC_emit = (regnode *)orig_emit;
13178                 if (PL_regkind[op] == POSIXD) {
13179                     if (invert) {
13180                         op += NPOSIXD - POSIXD;
13181                     }
13182                 }
13183             }
13184
13185             ret = reg_node(pRExC_state, op);
13186
13187             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13188                 if (! SIZE_ONLY) {
13189                     FLAGS(ret) = arg;
13190                 }
13191                 *flagp |= HASWIDTH|SIMPLE;
13192             }
13193             else if (PL_regkind[op] == EXACT) {
13194                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13195             }
13196
13197             RExC_parse = (char *) cur_parse;
13198
13199             SvREFCNT_dec(posixes);
13200             SvREFCNT_dec(cp_list);
13201             return ret;
13202         }
13203     }
13204
13205     if (SIZE_ONLY)
13206         return ret;
13207     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13208
13209     /* If folding, we calculate all characters that could fold to or from the
13210      * ones already on the list */
13211     if (FOLD && cp_list) {
13212         UV start, end;  /* End points of code point ranges */
13213
13214         SV* fold_intersection = NULL;
13215
13216         /* If the highest code point is within Latin1, we can use the
13217          * compiled-in Alphas list, and not have to go out to disk.  This
13218          * yields two false positives, the masculine and feminine ordinal
13219          * indicators, which are weeded out below using the
13220          * IS_IN_SOME_FOLD_L1() macro */
13221         if (invlist_highest(cp_list) < 256) {
13222             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13223                                                            &fold_intersection);
13224         }
13225         else {
13226
13227             /* Here, there are non-Latin1 code points, so we will have to go
13228              * fetch the list of all the characters that participate in folds
13229              */
13230             if (! PL_utf8_foldable) {
13231                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13232                                        &PL_sv_undef, 1, 0);
13233                 PL_utf8_foldable = _get_swash_invlist(swash);
13234                 SvREFCNT_dec_NN(swash);
13235             }
13236
13237             /* This is a hash that for a particular fold gives all characters
13238              * that are involved in it */
13239             if (! PL_utf8_foldclosures) {
13240
13241                 /* If we were unable to find any folds, then we likely won't be
13242                  * able to find the closures.  So just create an empty list.
13243                  * Folding will effectively be restricted to the non-Unicode
13244                  * rules hard-coded into Perl.  (This case happens legitimately
13245                  * during compilation of Perl itself before the Unicode tables
13246                  * are generated) */
13247                 if (_invlist_len(PL_utf8_foldable) == 0) {
13248                     PL_utf8_foldclosures = newHV();
13249                 }
13250                 else {
13251                     /* If the folds haven't been read in, call a fold function
13252                      * to force that */
13253                     if (! PL_utf8_tofold) {
13254                         U8 dummy[UTF8_MAXBYTES+1];
13255
13256                         /* This string is just a short named one above \xff */
13257                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13258                         assert(PL_utf8_tofold); /* Verify that worked */
13259                     }
13260                     PL_utf8_foldclosures =
13261                                     _swash_inversion_hash(PL_utf8_tofold);
13262                 }
13263             }
13264
13265             /* Only the characters in this class that participate in folds need
13266              * be checked.  Get the intersection of this class and all the
13267              * possible characters that are foldable.  This can quickly narrow
13268              * down a large class */
13269             _invlist_intersection(PL_utf8_foldable, cp_list,
13270                                   &fold_intersection);
13271         }
13272
13273         /* Now look at the foldable characters in this class individually */
13274         invlist_iterinit(fold_intersection);
13275         while (invlist_iternext(fold_intersection, &start, &end)) {
13276             UV j;
13277
13278             /* Locale folding for Latin1 characters is deferred until runtime */
13279             if (LOC && start < 256) {
13280                 start = 256;
13281             }
13282
13283             /* Look at every character in the range */
13284             for (j = start; j <= end; j++) {
13285
13286                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13287                 STRLEN foldlen;
13288                 SV** listp;
13289
13290                 if (j < 256) {
13291
13292                     /* We have the latin1 folding rules hard-coded here so that
13293                      * an innocent-looking character class, like /[ks]/i won't
13294                      * have to go out to disk to find the possible matches.
13295                      * XXX It would be better to generate these via regen, in
13296                      * case a new version of the Unicode standard adds new
13297                      * mappings, though that is not really likely, and may be
13298                      * caught by the default: case of the switch below. */
13299
13300                     if (IS_IN_SOME_FOLD_L1(j)) {
13301
13302                         /* ASCII is always matched; non-ASCII is matched only
13303                          * under Unicode rules */
13304                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13305                             cp_list =
13306                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13307                         }
13308                         else {
13309                             depends_list =
13310                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13311                         }
13312                     }
13313
13314                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13315                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13316                     {
13317                         /* Certain Latin1 characters have matches outside
13318                          * Latin1.  To get here, <j> is one of those
13319                          * characters.   None of these matches is valid for
13320                          * ASCII characters under /aa, which is why the 'if'
13321                          * just above excludes those.  These matches only
13322                          * happen when the target string is utf8.  The code
13323                          * below adds the single fold closures for <j> to the
13324                          * inversion list. */
13325                         switch (j) {
13326                             case 'k':
13327                             case 'K':
13328                                 cp_list =
13329                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13330                                 break;
13331                             case 's':
13332                             case 'S':
13333                                 cp_list = add_cp_to_invlist(cp_list,
13334                                                     LATIN_SMALL_LETTER_LONG_S);
13335                                 break;
13336                             case MICRO_SIGN:
13337                                 cp_list = add_cp_to_invlist(cp_list,
13338                                                     GREEK_CAPITAL_LETTER_MU);
13339                                 cp_list = add_cp_to_invlist(cp_list,
13340                                                     GREEK_SMALL_LETTER_MU);
13341                                 break;
13342                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13343                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13344                                 cp_list =
13345                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13346                                 break;
13347                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13348                                 cp_list = add_cp_to_invlist(cp_list,
13349                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13350                                 break;
13351                             case LATIN_SMALL_LETTER_SHARP_S:
13352                                 cp_list = add_cp_to_invlist(cp_list,
13353                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13354                                 break;
13355                             case 'F': case 'f':
13356                             case 'I': case 'i':
13357                             case 'L': case 'l':
13358                             case 'T': case 't':
13359                             case 'A': case 'a':
13360                             case 'H': case 'h':
13361                             case 'J': case 'j':
13362                             case 'N': case 'n':
13363                             case 'W': case 'w':
13364                             case 'Y': case 'y':
13365                                 /* These all are targets of multi-character
13366                                  * folds from code points that require UTF8 to
13367                                  * express, so they can't match unless the
13368                                  * target string is in UTF-8, so no action here
13369                                  * is necessary, as regexec.c properly handles
13370                                  * the general case for UTF-8 matching and
13371                                  * multi-char folds */
13372                                 break;
13373                             default:
13374                                 /* Use deprecated warning to increase the
13375                                  * chances of this being output */
13376                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13377                                 break;
13378                         }
13379                     }
13380                     continue;
13381                 }
13382
13383                 /* Here is an above Latin1 character.  We don't have the rules
13384                  * hard-coded for it.  First, get its fold.  This is the simple
13385                  * fold, as the multi-character folds have been handled earlier
13386                  * and separated out */
13387                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13388                                                ((LOC)
13389                                                ? FOLD_FLAGS_LOCALE
13390                                                : (ASCII_FOLD_RESTRICTED)
13391                                                   ? FOLD_FLAGS_NOMIX_ASCII
13392                                                   : 0));
13393
13394                 /* Single character fold of above Latin1.  Add everything in
13395                  * its fold closure to the list that this node should match.
13396                  * The fold closures data structure is a hash with the keys
13397                  * being the UTF-8 of every character that is folded to, like
13398                  * 'k', and the values each an array of all code points that
13399                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13400                  * Multi-character folds are not included */
13401                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13402                                       (char *) foldbuf, foldlen, FALSE)))
13403                 {
13404                     AV* list = (AV*) *listp;
13405                     IV k;
13406                     for (k = 0; k <= av_len(list); k++) {
13407                         SV** c_p = av_fetch(list, k, FALSE);
13408                         UV c;
13409                         if (c_p == NULL) {
13410                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13411                         }
13412                         c = SvUV(*c_p);
13413
13414                         /* /aa doesn't allow folds between ASCII and non-; /l
13415                          * doesn't allow them between above and below 256 */
13416                         if ((ASCII_FOLD_RESTRICTED
13417                                   && (isASCII(c) != isASCII(j)))
13418                             || (LOC && ((c < 256) != (j < 256))))
13419                         {
13420                             continue;
13421                         }
13422
13423                         /* Folds involving non-ascii Latin1 characters
13424                          * under /d are added to a separate list */
13425                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13426                         {
13427                             cp_list = add_cp_to_invlist(cp_list, c);
13428                         }
13429                         else {
13430                           depends_list = add_cp_to_invlist(depends_list, c);
13431                         }
13432                     }
13433                 }
13434             }
13435         }
13436         SvREFCNT_dec_NN(fold_intersection);
13437     }
13438
13439     /* And combine the result (if any) with any inversion list from posix
13440      * classes.  The lists are kept separate up to now because we don't want to
13441      * fold the classes (folding of those is automatically handled by the swash
13442      * fetching code) */
13443     if (posixes) {
13444         if (! DEPENDS_SEMANTICS) {
13445             if (cp_list) {
13446                 _invlist_union(cp_list, posixes, &cp_list);
13447                 SvREFCNT_dec_NN(posixes);
13448             }
13449             else {
13450                 cp_list = posixes;
13451             }
13452         }
13453         else {
13454             /* Under /d, we put into a separate list the Latin1 things that
13455              * match only when the target string is utf8 */
13456             SV* nonascii_but_latin1_properties = NULL;
13457             _invlist_intersection(posixes, PL_Latin1,
13458                                   &nonascii_but_latin1_properties);
13459             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13460                               &nonascii_but_latin1_properties);
13461             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13462                               &posixes);
13463             if (cp_list) {
13464                 _invlist_union(cp_list, posixes, &cp_list);
13465                 SvREFCNT_dec_NN(posixes);
13466             }
13467             else {
13468                 cp_list = posixes;
13469             }
13470
13471             if (depends_list) {
13472                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13473                                &depends_list);
13474                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13475             }
13476             else {
13477                 depends_list = nonascii_but_latin1_properties;
13478             }
13479         }
13480     }
13481
13482     /* And combine the result (if any) with any inversion list from properties.
13483      * The lists are kept separate up to now so that we can distinguish the two
13484      * in regards to matching above-Unicode.  A run-time warning is generated
13485      * if a Unicode property is matched against a non-Unicode code point. But,
13486      * we allow user-defined properties to match anything, without any warning,
13487      * and we also suppress the warning if there is a portion of the character
13488      * class that isn't a Unicode property, and which matches above Unicode, \W
13489      * or [\x{110000}] for example.
13490      * (Note that in this case, unlike the Posix one above, there is no
13491      * <depends_list>, because having a Unicode property forces Unicode
13492      * semantics */
13493     if (properties) {
13494         bool warn_super = ! has_user_defined_property;
13495         if (cp_list) {
13496
13497             /* If it matters to the final outcome, see if a non-property
13498              * component of the class matches above Unicode.  If so, the
13499              * warning gets suppressed.  This is true even if just a single
13500              * such code point is specified, as though not strictly correct if
13501              * another such code point is matched against, the fact that they
13502              * are using above-Unicode code points indicates they should know
13503              * the issues involved */
13504             if (warn_super) {
13505                 bool non_prop_matches_above_Unicode =
13506                             runtime_posix_matches_above_Unicode
13507                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13508                 if (invert) {
13509                     non_prop_matches_above_Unicode =
13510                                             !  non_prop_matches_above_Unicode;
13511                 }
13512                 warn_super = ! non_prop_matches_above_Unicode;
13513             }
13514
13515             _invlist_union(properties, cp_list, &cp_list);
13516             SvREFCNT_dec_NN(properties);
13517         }
13518         else {
13519             cp_list = properties;
13520         }
13521
13522         if (warn_super) {
13523             OP(ret) = ANYOF_WARN_SUPER;
13524         }
13525     }
13526
13527     /* Here, we have calculated what code points should be in the character
13528      * class.
13529      *
13530      * Now we can see about various optimizations.  Fold calculation (which we
13531      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13532      * would invert to include K, which under /i would match k, which it
13533      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13534      * folded until runtime */
13535
13536     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13537      * at compile time.  Besides not inverting folded locale now, we can't
13538      * invert if there are things such as \w, which aren't known until runtime
13539      * */
13540     if (invert
13541         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13542         && ! depends_list
13543         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13544     {
13545         _invlist_invert(cp_list);
13546
13547         /* Any swash can't be used as-is, because we've inverted things */
13548         if (swash) {
13549             SvREFCNT_dec_NN(swash);
13550             swash = NULL;
13551         }
13552
13553         /* Clear the invert flag since have just done it here */
13554         invert = FALSE;
13555     }
13556
13557     if (ret_invlist) {
13558         *ret_invlist = cp_list;
13559
13560         /* Discard the generated node */
13561         if (SIZE_ONLY) {
13562             RExC_size = orig_size;
13563         }
13564         else {
13565             RExC_emit = orig_emit;
13566         }
13567         return orig_emit;
13568     }
13569
13570     /* If we didn't do folding, it's because some information isn't available
13571      * until runtime; set the run-time fold flag for these.  (We don't have to
13572      * worry about properties folding, as that is taken care of by the swash
13573      * fetching) */
13574     if (FOLD && LOC)
13575     {
13576        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13577     }
13578
13579     /* Some character classes are equivalent to other nodes.  Such nodes take
13580      * up less room and generally fewer operations to execute than ANYOF nodes.
13581      * Above, we checked for and optimized into some such equivalents for
13582      * certain common classes that are easy to test.  Getting to this point in
13583      * the code means that the class didn't get optimized there.  Since this
13584      * code is only executed in Pass 2, it is too late to save space--it has
13585      * been allocated in Pass 1, and currently isn't given back.  But turning
13586      * things into an EXACTish node can allow the optimizer to join it to any
13587      * adjacent such nodes.  And if the class is equivalent to things like /./,
13588      * expensive run-time swashes can be avoided.  Now that we have more
13589      * complete information, we can find things necessarily missed by the
13590      * earlier code.  I (khw) am not sure how much to look for here.  It would
13591      * be easy, but perhaps too slow, to check any candidates against all the
13592      * node types they could possibly match using _invlistEQ(). */
13593
13594     if (cp_list
13595         && ! invert
13596         && ! depends_list
13597         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13598         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13599     {
13600         UV start, end;
13601         U8 op = END;  /* The optimzation node-type */
13602         const char * cur_parse= RExC_parse;
13603
13604         invlist_iterinit(cp_list);
13605         if (! invlist_iternext(cp_list, &start, &end)) {
13606
13607             /* Here, the list is empty.  This happens, for example, when a
13608              * Unicode property is the only thing in the character class, and
13609              * it doesn't match anything.  (perluniprops.pod notes such
13610              * properties) */
13611             op = OPFAIL;
13612             *flagp |= HASWIDTH|SIMPLE;
13613         }
13614         else if (start == end) {    /* The range is a single code point */
13615             if (! invlist_iternext(cp_list, &start, &end)
13616
13617                     /* Don't do this optimization if it would require changing
13618                      * the pattern to UTF-8 */
13619                 && (start < 256 || UTF))
13620             {
13621                 /* Here, the list contains a single code point.  Can optimize
13622                  * into an EXACT node */
13623
13624                 value = start;
13625
13626                 if (! FOLD) {
13627                     op = EXACT;
13628                 }
13629                 else if (LOC) {
13630
13631                     /* A locale node under folding with one code point can be
13632                      * an EXACTFL, as its fold won't be calculated until
13633                      * runtime */
13634                     op = EXACTFL;
13635                 }
13636                 else {
13637
13638                     /* Here, we are generally folding, but there is only one
13639                      * code point to match.  If we have to, we use an EXACT
13640                      * node, but it would be better for joining with adjacent
13641                      * nodes in the optimization pass if we used the same
13642                      * EXACTFish node that any such are likely to be.  We can
13643                      * do this iff the code point doesn't participate in any
13644                      * folds.  For example, an EXACTF of a colon is the same as
13645                      * an EXACT one, since nothing folds to or from a colon. */
13646                     if (value < 256) {
13647                         if (IS_IN_SOME_FOLD_L1(value)) {
13648                             op = EXACT;
13649                         }
13650                     }
13651                     else {
13652                         if (! PL_utf8_foldable) {
13653                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13654                                                 &PL_sv_undef, 1, 0);
13655                             PL_utf8_foldable = _get_swash_invlist(swash);
13656                             SvREFCNT_dec_NN(swash);
13657                         }
13658                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13659                             op = EXACT;
13660                         }
13661                     }
13662
13663                     /* If we haven't found the node type, above, it means we
13664                      * can use the prevailing one */
13665                     if (op == END) {
13666                         op = compute_EXACTish(pRExC_state);
13667                     }
13668                 }
13669             }
13670         }
13671         else if (start == 0) {
13672             if (end == UV_MAX) {
13673                 op = SANY;
13674                 *flagp |= HASWIDTH|SIMPLE;
13675                 RExC_naughty++;
13676             }
13677             else if (end == '\n' - 1
13678                     && invlist_iternext(cp_list, &start, &end)
13679                     && start == '\n' + 1 && end == UV_MAX)
13680             {
13681                 op = REG_ANY;
13682                 *flagp |= HASWIDTH|SIMPLE;
13683                 RExC_naughty++;
13684             }
13685         }
13686         invlist_iterfinish(cp_list);
13687
13688         if (op != END) {
13689             RExC_parse = (char *)orig_parse;
13690             RExC_emit = (regnode *)orig_emit;
13691
13692             ret = reg_node(pRExC_state, op);
13693
13694             RExC_parse = (char *)cur_parse;
13695
13696             if (PL_regkind[op] == EXACT) {
13697                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13698             }
13699
13700             SvREFCNT_dec_NN(cp_list);
13701             return ret;
13702         }
13703     }
13704
13705     /* Here, <cp_list> contains all the code points we can determine at
13706      * compile time that match under all conditions.  Go through it, and
13707      * for things that belong in the bitmap, put them there, and delete from
13708      * <cp_list>.  While we are at it, see if everything above 255 is in the
13709      * list, and if so, set a flag to speed up execution */
13710     ANYOF_BITMAP_ZERO(ret);
13711     if (cp_list) {
13712
13713         /* This gets set if we actually need to modify things */
13714         bool change_invlist = FALSE;
13715
13716         UV start, end;
13717
13718         /* Start looking through <cp_list> */
13719         invlist_iterinit(cp_list);
13720         while (invlist_iternext(cp_list, &start, &end)) {
13721             UV high;
13722             int i;
13723
13724             if (end == UV_MAX && start <= 256) {
13725                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13726             }
13727
13728             /* Quit if are above what we should change */
13729             if (start > 255) {
13730                 break;
13731             }
13732
13733             change_invlist = TRUE;
13734
13735             /* Set all the bits in the range, up to the max that we are doing */
13736             high = (end < 255) ? end : 255;
13737             for (i = start; i <= (int) high; i++) {
13738                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13739                     ANYOF_BITMAP_SET(ret, i);
13740                     prevvalue = value;
13741                     value = i;
13742                 }
13743             }
13744         }
13745         invlist_iterfinish(cp_list);
13746
13747         /* Done with loop; remove any code points that are in the bitmap from
13748          * <cp_list> */
13749         if (change_invlist) {
13750             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13751         }
13752
13753         /* If have completely emptied it, remove it completely */
13754         if (_invlist_len(cp_list) == 0) {
13755             SvREFCNT_dec_NN(cp_list);
13756             cp_list = NULL;
13757         }
13758     }
13759
13760     if (invert) {
13761         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13762     }
13763
13764     /* Here, the bitmap has been populated with all the Latin1 code points that
13765      * always match.  Can now add to the overall list those that match only
13766      * when the target string is UTF-8 (<depends_list>). */
13767     if (depends_list) {
13768         if (cp_list) {
13769             _invlist_union(cp_list, depends_list, &cp_list);
13770             SvREFCNT_dec_NN(depends_list);
13771         }
13772         else {
13773             cp_list = depends_list;
13774         }
13775     }
13776
13777     /* If there is a swash and more than one element, we can't use the swash in
13778      * the optimization below. */
13779     if (swash && element_count > 1) {
13780         SvREFCNT_dec_NN(swash);
13781         swash = NULL;
13782     }
13783
13784     if (! cp_list
13785         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13786     {
13787         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13788     }
13789     else {
13790         /* av[0] stores the character class description in its textual form:
13791          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13792          *       appropriate swash, and is also useful for dumping the regnode.
13793          * av[1] if NULL, is a placeholder to later contain the swash computed
13794          *       from av[0].  But if no further computation need be done, the
13795          *       swash is stored there now.
13796          * av[2] stores the cp_list inversion list for use in addition or
13797          *       instead of av[0]; used only if av[1] is NULL
13798          * av[3] is set if any component of the class is from a user-defined
13799          *       property; used only if av[1] is NULL */
13800         AV * const av = newAV();
13801         SV *rv;
13802
13803         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13804                         ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13805         if (swash) {
13806             av_store(av, 1, swash);
13807             SvREFCNT_dec_NN(cp_list);
13808         }
13809         else {
13810             av_store(av, 1, NULL);
13811             if (cp_list) {
13812                 av_store(av, 2, cp_list);
13813                 av_store(av, 3, newSVuv(has_user_defined_property));
13814             }
13815         }
13816
13817         rv = newRV_noinc(MUTABLE_SV(av));
13818         n = add_data(pRExC_state, 1, "s");
13819         RExC_rxi->data->data[n] = (void*)rv;
13820         ARG_SET(ret, n);
13821     }
13822
13823     *flagp |= HASWIDTH|SIMPLE;
13824     return ret;
13825 }
13826 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13827
13828
13829 /* reg_skipcomment()
13830
13831    Absorbs an /x style # comments from the input stream.
13832    Returns true if there is more text remaining in the stream.
13833    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13834    terminates the pattern without including a newline.
13835
13836    Note its the callers responsibility to ensure that we are
13837    actually in /x mode
13838
13839 */
13840
13841 STATIC bool
13842 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13843 {
13844     bool ended = 0;
13845
13846     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13847
13848     while (RExC_parse < RExC_end)
13849         if (*RExC_parse++ == '\n') {
13850             ended = 1;
13851             break;
13852         }
13853     if (!ended) {
13854         /* we ran off the end of the pattern without ending
13855            the comment, so we have to add an \n when wrapping */
13856         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13857         return 0;
13858     } else
13859         return 1;
13860 }
13861
13862 /* nextchar()
13863
13864    Advances the parse position, and optionally absorbs
13865    "whitespace" from the inputstream.
13866
13867    Without /x "whitespace" means (?#...) style comments only,
13868    with /x this means (?#...) and # comments and whitespace proper.
13869
13870    Returns the RExC_parse point from BEFORE the scan occurs.
13871
13872    This is the /x friendly way of saying RExC_parse++.
13873 */
13874
13875 STATIC char*
13876 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13877 {
13878     char* const retval = RExC_parse++;
13879
13880     PERL_ARGS_ASSERT_NEXTCHAR;
13881
13882     for (;;) {
13883         if (RExC_end - RExC_parse >= 3
13884             && *RExC_parse == '('
13885             && RExC_parse[1] == '?'
13886             && RExC_parse[2] == '#')
13887         {
13888             while (*RExC_parse != ')') {
13889                 if (RExC_parse == RExC_end)
13890                     FAIL("Sequence (?#... not terminated");
13891                 RExC_parse++;
13892             }
13893             RExC_parse++;
13894             continue;
13895         }
13896         if (RExC_flags & RXf_PMf_EXTENDED) {
13897             if (isSPACE(*RExC_parse)) {
13898                 RExC_parse++;
13899                 continue;
13900             }
13901             else if (*RExC_parse == '#') {
13902                 if ( reg_skipcomment( pRExC_state ) )
13903                     continue;
13904             }
13905         }
13906         return retval;
13907     }
13908 }
13909
13910 /*
13911 - reg_node - emit a node
13912 */
13913 STATIC regnode *                        /* Location. */
13914 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13915 {
13916     dVAR;
13917     regnode *ptr;
13918     regnode * const ret = RExC_emit;
13919     GET_RE_DEBUG_FLAGS_DECL;
13920
13921     PERL_ARGS_ASSERT_REG_NODE;
13922
13923     if (SIZE_ONLY) {
13924         SIZE_ALIGN(RExC_size);
13925         RExC_size += 1;
13926         return(ret);
13927     }
13928     if (RExC_emit >= RExC_emit_bound)
13929         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13930                    op, RExC_emit, RExC_emit_bound);
13931
13932     NODE_ALIGN_FILL(ret);
13933     ptr = ret;
13934     FILL_ADVANCE_NODE(ptr, op);
13935     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13936 #ifdef RE_TRACK_PATTERN_OFFSETS
13937     if (RExC_offsets) {         /* MJD */
13938         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13939               "reg_node", __LINE__,
13940               PL_reg_name[op],
13941               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13942                 ? "Overwriting end of array!\n" : "OK",
13943               (UV)(RExC_emit - RExC_emit_start),
13944               (UV)(RExC_parse - RExC_start),
13945               (UV)RExC_offsets[0]));
13946         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13947     }
13948 #endif
13949     RExC_emit = ptr;
13950     return(ret);
13951 }
13952
13953 /*
13954 - reganode - emit a node with an argument
13955 */
13956 STATIC regnode *                        /* Location. */
13957 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13958 {
13959     dVAR;
13960     regnode *ptr;
13961     regnode * const ret = RExC_emit;
13962     GET_RE_DEBUG_FLAGS_DECL;
13963
13964     PERL_ARGS_ASSERT_REGANODE;
13965
13966     if (SIZE_ONLY) {
13967         SIZE_ALIGN(RExC_size);
13968         RExC_size += 2;
13969         /*
13970            We can't do this:
13971
13972            assert(2==regarglen[op]+1);
13973
13974            Anything larger than this has to allocate the extra amount.
13975            If we changed this to be:
13976
13977            RExC_size += (1 + regarglen[op]);
13978
13979            then it wouldn't matter. Its not clear what side effect
13980            might come from that so its not done so far.
13981            -- dmq
13982         */
13983         return(ret);
13984     }
13985     if (RExC_emit >= RExC_emit_bound)
13986         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13987                    op, RExC_emit, RExC_emit_bound);
13988
13989     NODE_ALIGN_FILL(ret);
13990     ptr = ret;
13991     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13992     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13993 #ifdef RE_TRACK_PATTERN_OFFSETS
13994     if (RExC_offsets) {         /* MJD */
13995         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13996               "reganode",
13997               __LINE__,
13998               PL_reg_name[op],
13999               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14000               "Overwriting end of array!\n" : "OK",
14001               (UV)(RExC_emit - RExC_emit_start),
14002               (UV)(RExC_parse - RExC_start),
14003               (UV)RExC_offsets[0]));
14004         Set_Cur_Node_Offset;
14005     }
14006 #endif
14007     RExC_emit = ptr;
14008     return(ret);
14009 }
14010
14011 /*
14012 - reguni - emit (if appropriate) a Unicode character
14013 */
14014 STATIC STRLEN
14015 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14016 {
14017     dVAR;
14018
14019     PERL_ARGS_ASSERT_REGUNI;
14020
14021     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14022 }
14023
14024 /*
14025 - reginsert - insert an operator in front of already-emitted operand
14026 *
14027 * Means relocating the operand.
14028 */
14029 STATIC void
14030 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14031 {
14032     dVAR;
14033     regnode *src;
14034     regnode *dst;
14035     regnode *place;
14036     const int offset = regarglen[(U8)op];
14037     const int size = NODE_STEP_REGNODE + offset;
14038     GET_RE_DEBUG_FLAGS_DECL;
14039
14040     PERL_ARGS_ASSERT_REGINSERT;
14041     PERL_UNUSED_ARG(depth);
14042 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14043     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14044     if (SIZE_ONLY) {
14045         RExC_size += size;
14046         return;
14047     }
14048
14049     src = RExC_emit;
14050     RExC_emit += size;
14051     dst = RExC_emit;
14052     if (RExC_open_parens) {
14053         int paren;
14054         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14055         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14056             if ( RExC_open_parens[paren] >= opnd ) {
14057                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14058                 RExC_open_parens[paren] += size;
14059             } else {
14060                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14061             }
14062             if ( RExC_close_parens[paren] >= opnd ) {
14063                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14064                 RExC_close_parens[paren] += size;
14065             } else {
14066                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14067             }
14068         }
14069     }
14070
14071     while (src > opnd) {
14072         StructCopy(--src, --dst, regnode);
14073 #ifdef RE_TRACK_PATTERN_OFFSETS
14074         if (RExC_offsets) {     /* MJD 20010112 */
14075             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14076                   "reg_insert",
14077                   __LINE__,
14078                   PL_reg_name[op],
14079                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14080                     ? "Overwriting end of array!\n" : "OK",
14081                   (UV)(src - RExC_emit_start),
14082                   (UV)(dst - RExC_emit_start),
14083                   (UV)RExC_offsets[0]));
14084             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14085             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14086         }
14087 #endif
14088     }
14089
14090
14091     place = opnd;               /* Op node, where operand used to be. */
14092 #ifdef RE_TRACK_PATTERN_OFFSETS
14093     if (RExC_offsets) {         /* MJD */
14094         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14095               "reginsert",
14096               __LINE__,
14097               PL_reg_name[op],
14098               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14099               ? "Overwriting end of array!\n" : "OK",
14100               (UV)(place - RExC_emit_start),
14101               (UV)(RExC_parse - RExC_start),
14102               (UV)RExC_offsets[0]));
14103         Set_Node_Offset(place, RExC_parse);
14104         Set_Node_Length(place, 1);
14105     }
14106 #endif
14107     src = NEXTOPER(place);
14108     FILL_ADVANCE_NODE(place, op);
14109     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14110     Zero(src, offset, regnode);
14111 }
14112
14113 /*
14114 - regtail - set the next-pointer at the end of a node chain of p to val.
14115 - SEE ALSO: regtail_study
14116 */
14117 /* TODO: All three parms should be const */
14118 STATIC void
14119 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14120 {
14121     dVAR;
14122     regnode *scan;
14123     GET_RE_DEBUG_FLAGS_DECL;
14124
14125     PERL_ARGS_ASSERT_REGTAIL;
14126 #ifndef DEBUGGING
14127     PERL_UNUSED_ARG(depth);
14128 #endif
14129
14130     if (SIZE_ONLY)
14131         return;
14132
14133     /* Find last node. */
14134     scan = p;
14135     for (;;) {
14136         regnode * const temp = regnext(scan);
14137         DEBUG_PARSE_r({
14138             SV * const mysv=sv_newmortal();
14139             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14140             regprop(RExC_rx, mysv, scan);
14141             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14142                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14143                     (temp == NULL ? "->" : ""),
14144                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14145             );
14146         });
14147         if (temp == NULL)
14148             break;
14149         scan = temp;
14150     }
14151
14152     if (reg_off_by_arg[OP(scan)]) {
14153         ARG_SET(scan, val - scan);
14154     }
14155     else {
14156         NEXT_OFF(scan) = val - scan;
14157     }
14158 }
14159
14160 #ifdef DEBUGGING
14161 /*
14162 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14163 - Look for optimizable sequences at the same time.
14164 - currently only looks for EXACT chains.
14165
14166 This is experimental code. The idea is to use this routine to perform
14167 in place optimizations on branches and groups as they are constructed,
14168 with the long term intention of removing optimization from study_chunk so
14169 that it is purely analytical.
14170
14171 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14172 to control which is which.
14173
14174 */
14175 /* TODO: All four parms should be const */
14176
14177 STATIC U8
14178 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14179 {
14180     dVAR;
14181     regnode *scan;
14182     U8 exact = PSEUDO;
14183 #ifdef EXPERIMENTAL_INPLACESCAN
14184     I32 min = 0;
14185 #endif
14186     GET_RE_DEBUG_FLAGS_DECL;
14187
14188     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14189
14190
14191     if (SIZE_ONLY)
14192         return exact;
14193
14194     /* Find last node. */
14195
14196     scan = p;
14197     for (;;) {
14198         regnode * const temp = regnext(scan);
14199 #ifdef EXPERIMENTAL_INPLACESCAN
14200         if (PL_regkind[OP(scan)] == EXACT) {
14201             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14202             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14203                 return EXACT;
14204         }
14205 #endif
14206         if ( exact ) {
14207             switch (OP(scan)) {
14208                 case EXACT:
14209                 case EXACTF:
14210                 case EXACTFA:
14211                 case EXACTFU:
14212                 case EXACTFU_SS:
14213                 case EXACTFU_TRICKYFOLD:
14214                 case EXACTFL:
14215                         if( exact == PSEUDO )
14216                             exact= OP(scan);
14217                         else if ( exact != OP(scan) )
14218                             exact= 0;
14219                 case NOTHING:
14220                     break;
14221                 default:
14222                     exact= 0;
14223             }
14224         }
14225         DEBUG_PARSE_r({
14226             SV * const mysv=sv_newmortal();
14227             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14228             regprop(RExC_rx, mysv, scan);
14229             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14230                 SvPV_nolen_const(mysv),
14231                 REG_NODE_NUM(scan),
14232                 PL_reg_name[exact]);
14233         });
14234         if (temp == NULL)
14235             break;
14236         scan = temp;
14237     }
14238     DEBUG_PARSE_r({
14239         SV * const mysv_val=sv_newmortal();
14240         DEBUG_PARSE_MSG("");
14241         regprop(RExC_rx, mysv_val, val);
14242         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14243                       SvPV_nolen_const(mysv_val),
14244                       (IV)REG_NODE_NUM(val),
14245                       (IV)(val - scan)
14246         );
14247     });
14248     if (reg_off_by_arg[OP(scan)]) {
14249         ARG_SET(scan, val - scan);
14250     }
14251     else {
14252         NEXT_OFF(scan) = val - scan;
14253     }
14254
14255     return exact;
14256 }
14257 #endif
14258
14259 /*
14260  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14261  */
14262 #ifdef DEBUGGING
14263 static void
14264 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14265 {
14266     int bit;
14267     int set=0;
14268     regex_charset cs;
14269
14270     for (bit=0; bit<32; bit++) {
14271         if (flags & (1<<bit)) {
14272             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14273                 continue;
14274             }
14275             if (!set++ && lead)
14276                 PerlIO_printf(Perl_debug_log, "%s",lead);
14277             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14278         }
14279     }
14280     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14281             if (!set++ && lead) {
14282                 PerlIO_printf(Perl_debug_log, "%s",lead);
14283             }
14284             switch (cs) {
14285                 case REGEX_UNICODE_CHARSET:
14286                     PerlIO_printf(Perl_debug_log, "UNICODE");
14287                     break;
14288                 case REGEX_LOCALE_CHARSET:
14289                     PerlIO_printf(Perl_debug_log, "LOCALE");
14290                     break;
14291                 case REGEX_ASCII_RESTRICTED_CHARSET:
14292                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14293                     break;
14294                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14295                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14296                     break;
14297                 default:
14298                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14299                     break;
14300             }
14301     }
14302     if (lead)  {
14303         if (set)
14304             PerlIO_printf(Perl_debug_log, "\n");
14305         else
14306             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14307     }
14308 }
14309 #endif
14310
14311 void
14312 Perl_regdump(pTHX_ const regexp *r)
14313 {
14314 #ifdef DEBUGGING
14315     dVAR;
14316     SV * const sv = sv_newmortal();
14317     SV *dsv= sv_newmortal();
14318     RXi_GET_DECL(r,ri);
14319     GET_RE_DEBUG_FLAGS_DECL;
14320
14321     PERL_ARGS_ASSERT_REGDUMP;
14322
14323     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14324
14325     /* Header fields of interest. */
14326     if (r->anchored_substr) {
14327         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14328             RE_SV_DUMPLEN(r->anchored_substr), 30);
14329         PerlIO_printf(Perl_debug_log,
14330                       "anchored %s%s at %"IVdf" ",
14331                       s, RE_SV_TAIL(r->anchored_substr),
14332                       (IV)r->anchored_offset);
14333     } else if (r->anchored_utf8) {
14334         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14335             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14336         PerlIO_printf(Perl_debug_log,
14337                       "anchored utf8 %s%s at %"IVdf" ",
14338                       s, RE_SV_TAIL(r->anchored_utf8),
14339                       (IV)r->anchored_offset);
14340     }
14341     if (r->float_substr) {
14342         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14343             RE_SV_DUMPLEN(r->float_substr), 30);
14344         PerlIO_printf(Perl_debug_log,
14345                       "floating %s%s at %"IVdf"..%"UVuf" ",
14346                       s, RE_SV_TAIL(r->float_substr),
14347                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14348     } else if (r->float_utf8) {
14349         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14350             RE_SV_DUMPLEN(r->float_utf8), 30);
14351         PerlIO_printf(Perl_debug_log,
14352                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14353                       s, RE_SV_TAIL(r->float_utf8),
14354                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14355     }
14356     if (r->check_substr || r->check_utf8)
14357         PerlIO_printf(Perl_debug_log,
14358                       (const char *)
14359                       (r->check_substr == r->float_substr
14360                        && r->check_utf8 == r->float_utf8
14361                        ? "(checking floating" : "(checking anchored"));
14362     if (r->extflags & RXf_NOSCAN)
14363         PerlIO_printf(Perl_debug_log, " noscan");
14364     if (r->extflags & RXf_CHECK_ALL)
14365         PerlIO_printf(Perl_debug_log, " isall");
14366     if (r->check_substr || r->check_utf8)
14367         PerlIO_printf(Perl_debug_log, ") ");
14368
14369     if (ri->regstclass) {
14370         regprop(r, sv, ri->regstclass);
14371         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14372     }
14373     if (r->extflags & RXf_ANCH) {
14374         PerlIO_printf(Perl_debug_log, "anchored");
14375         if (r->extflags & RXf_ANCH_BOL)
14376             PerlIO_printf(Perl_debug_log, "(BOL)");
14377         if (r->extflags & RXf_ANCH_MBOL)
14378             PerlIO_printf(Perl_debug_log, "(MBOL)");
14379         if (r->extflags & RXf_ANCH_SBOL)
14380             PerlIO_printf(Perl_debug_log, "(SBOL)");
14381         if (r->extflags & RXf_ANCH_GPOS)
14382             PerlIO_printf(Perl_debug_log, "(GPOS)");
14383         PerlIO_putc(Perl_debug_log, ' ');
14384     }
14385     if (r->extflags & RXf_GPOS_SEEN)
14386         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14387     if (r->intflags & PREGf_SKIP)
14388         PerlIO_printf(Perl_debug_log, "plus ");
14389     if (r->intflags & PREGf_IMPLICIT)
14390         PerlIO_printf(Perl_debug_log, "implicit ");
14391     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14392     if (r->extflags & RXf_EVAL_SEEN)
14393         PerlIO_printf(Perl_debug_log, "with eval ");
14394     PerlIO_printf(Perl_debug_log, "\n");
14395     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14396 #else
14397     PERL_ARGS_ASSERT_REGDUMP;
14398     PERL_UNUSED_CONTEXT;
14399     PERL_UNUSED_ARG(r);
14400 #endif  /* DEBUGGING */
14401 }
14402
14403 /*
14404 - regprop - printable representation of opcode
14405 */
14406 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14407 STMT_START { \
14408         if (do_sep) {                           \
14409             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14410             if (flags & ANYOF_INVERT)           \
14411                 /*make sure the invert info is in each */ \
14412                 sv_catpvs(sv, "^");             \
14413             do_sep = 0;                         \
14414         }                                       \
14415 } STMT_END
14416
14417 void
14418 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14419 {
14420 #ifdef DEBUGGING
14421     dVAR;
14422     int k;
14423
14424     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14425     static const char * const anyofs[] = {
14426 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14427     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14428     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14429     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14430     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14431     || _CC_VERTSPACE != 16
14432   #error Need to adjust order of anyofs[]
14433 #endif
14434         "[\\w]",
14435         "[\\W]",
14436         "[\\d]",
14437         "[\\D]",
14438         "[:alpha:]",
14439         "[:^alpha:]",
14440         "[:lower:]",
14441         "[:^lower:]",
14442         "[:upper:]",
14443         "[:^upper:]",
14444         "[:punct:]",
14445         "[:^punct:]",
14446         "[:print:]",
14447         "[:^print:]",
14448         "[:alnum:]",
14449         "[:^alnum:]",
14450         "[:graph:]",
14451         "[:^graph:]",
14452         "[:cased:]",
14453         "[:^cased:]",
14454         "[\\s]",
14455         "[\\S]",
14456         "[:blank:]",
14457         "[:^blank:]",
14458         "[:xdigit:]",
14459         "[:^xdigit:]",
14460         "[:space:]",
14461         "[:^space:]",
14462         "[:cntrl:]",
14463         "[:^cntrl:]",
14464         "[:ascii:]",
14465         "[:^ascii:]",
14466         "[\\v]",
14467         "[\\V]"
14468     };
14469     RXi_GET_DECL(prog,progi);
14470     GET_RE_DEBUG_FLAGS_DECL;
14471
14472     PERL_ARGS_ASSERT_REGPROP;
14473
14474     sv_setpvs(sv, "");
14475
14476     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14477         /* It would be nice to FAIL() here, but this may be called from
14478            regexec.c, and it would be hard to supply pRExC_state. */
14479         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14480     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14481
14482     k = PL_regkind[OP(o)];
14483
14484     if (k == EXACT) {
14485         sv_catpvs(sv, " ");
14486         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14487          * is a crude hack but it may be the best for now since
14488          * we have no flag "this EXACTish node was UTF-8"
14489          * --jhi */
14490         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14491                   PERL_PV_ESCAPE_UNI_DETECT |
14492                   PERL_PV_ESCAPE_NONASCII   |
14493                   PERL_PV_PRETTY_ELLIPSES   |
14494                   PERL_PV_PRETTY_LTGT       |
14495                   PERL_PV_PRETTY_NOCLEAR
14496                   );
14497     } else if (k == TRIE) {
14498         /* print the details of the trie in dumpuntil instead, as
14499          * progi->data isn't available here */
14500         const char op = OP(o);
14501         const U32 n = ARG(o);
14502         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14503                (reg_ac_data *)progi->data->data[n] :
14504                NULL;
14505         const reg_trie_data * const trie
14506             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14507
14508         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14509         DEBUG_TRIE_COMPILE_r(
14510             Perl_sv_catpvf(aTHX_ sv,
14511                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14512                 (UV)trie->startstate,
14513                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14514                 (UV)trie->wordcount,
14515                 (UV)trie->minlen,
14516                 (UV)trie->maxlen,
14517                 (UV)TRIE_CHARCOUNT(trie),
14518                 (UV)trie->uniquecharcount
14519             )
14520         );
14521         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14522             int i;
14523             int rangestart = -1;
14524             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14525             sv_catpvs(sv, "[");
14526             for (i = 0; i <= 256; i++) {
14527                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14528                     if (rangestart == -1)
14529                         rangestart = i;
14530                 } else if (rangestart != -1) {
14531                     if (i <= rangestart + 3)
14532                         for (; rangestart < i; rangestart++)
14533                             put_byte(sv, rangestart);
14534                     else {
14535                         put_byte(sv, rangestart);
14536                         sv_catpvs(sv, "-");
14537                         put_byte(sv, i - 1);
14538                     }
14539                     rangestart = -1;
14540                 }
14541             }
14542             sv_catpvs(sv, "]");
14543         }
14544
14545     } else if (k == CURLY) {
14546         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14547             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14548         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14549     }
14550     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14551         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14552     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14553         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14554         if ( RXp_PAREN_NAMES(prog) ) {
14555             if ( k != REF || (OP(o) < NREF)) {
14556                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14557                 SV **name= av_fetch(list, ARG(o), 0 );
14558                 if (name)
14559                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14560             }
14561             else {
14562                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14563                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14564                 I32 *nums=(I32*)SvPVX(sv_dat);
14565                 SV **name= av_fetch(list, nums[0], 0 );
14566                 I32 n;
14567                 if (name) {
14568                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14569                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14570                                     (n ? "," : ""), (IV)nums[n]);
14571                     }
14572                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14573                 }
14574             }
14575         }
14576     } else if (k == GOSUB)
14577         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14578     else if (k == VERB) {
14579         if (!o->flags)
14580             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14581                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14582     } else if (k == LOGICAL)
14583         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14584     else if (k == ANYOF) {
14585         int i, rangestart = -1;
14586         const U8 flags = ANYOF_FLAGS(o);
14587         int do_sep = 0;
14588
14589
14590         if (flags & ANYOF_LOCALE)
14591             sv_catpvs(sv, "{loc}");
14592         if (flags & ANYOF_LOC_FOLD)
14593             sv_catpvs(sv, "{i}");
14594         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14595         if (flags & ANYOF_INVERT)
14596             sv_catpvs(sv, "^");
14597
14598         /* output what the standard cp 0-255 bitmap matches */
14599         for (i = 0; i <= 256; i++) {
14600             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14601                 if (rangestart == -1)
14602                     rangestart = i;
14603             } else if (rangestart != -1) {
14604                 if (i <= rangestart + 3)
14605                     for (; rangestart < i; rangestart++)
14606                         put_byte(sv, rangestart);
14607                 else {
14608                     put_byte(sv, rangestart);
14609                     sv_catpvs(sv, "-");
14610                     put_byte(sv, i - 1);
14611                 }
14612                 do_sep = 1;
14613                 rangestart = -1;
14614             }
14615         }
14616
14617         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14618         /* output any special charclass tests (used entirely under use locale) */
14619         if (ANYOF_CLASS_TEST_ANY_SET(o))
14620             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14621                 if (ANYOF_CLASS_TEST(o,i)) {
14622                     sv_catpv(sv, anyofs[i]);
14623                     do_sep = 1;
14624                 }
14625
14626         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14627
14628         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14629             sv_catpvs(sv, "{non-utf8-latin1-all}");
14630         }
14631
14632         /* output information about the unicode matching */
14633         if (flags & ANYOF_UNICODE_ALL)
14634             sv_catpvs(sv, "{unicode_all}");
14635         else if (ANYOF_NONBITMAP(o))
14636             sv_catpvs(sv, "{unicode}");
14637         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14638             sv_catpvs(sv, "{outside bitmap}");
14639
14640         if (ANYOF_NONBITMAP(o)) {
14641             SV *lv; /* Set if there is something outside the bit map */
14642             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14643             bool byte_output = FALSE;   /* If something in the bitmap has been
14644                                            output */
14645
14646             if (lv && lv != &PL_sv_undef) {
14647                 if (sw) {
14648                     U8 s[UTF8_MAXBYTES_CASE+1];
14649
14650                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14651                         uvchr_to_utf8(s, i);
14652
14653                         if (i < 256
14654                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14655                                                                things already
14656                                                                output as part
14657                                                                of the bitmap */
14658                             && swash_fetch(sw, s, TRUE))
14659                         {
14660                             if (rangestart == -1)
14661                                 rangestart = i;
14662                         } else if (rangestart != -1) {
14663                             byte_output = TRUE;
14664                             if (i <= rangestart + 3)
14665                                 for (; rangestart < i; rangestart++) {
14666                                     put_byte(sv, rangestart);
14667                                 }
14668                             else {
14669                                 put_byte(sv, rangestart);
14670                                 sv_catpvs(sv, "-");
14671                                 put_byte(sv, i-1);
14672                             }
14673                             rangestart = -1;
14674                         }
14675                     }
14676                 }
14677
14678                 {
14679                     char *s = savesvpv(lv);
14680                     char * const origs = s;
14681
14682                     while (*s && *s != '\n')
14683                         s++;
14684
14685                     if (*s == '\n') {
14686                         const char * const t = ++s;
14687
14688                         if (byte_output) {
14689                             sv_catpvs(sv, " ");
14690                         }
14691
14692                         while (*s) {
14693                             if (*s == '\n') {
14694
14695                                 /* Truncate very long output */
14696                                 if (s - origs > 256) {
14697                                     Perl_sv_catpvf(aTHX_ sv,
14698                                                    "%.*s...",
14699                                                    (int) (s - origs - 1),
14700                                                    t);
14701                                     goto out_dump;
14702                                 }
14703                                 *s = ' ';
14704                             }
14705                             else if (*s == '\t') {
14706                                 *s = '-';
14707                             }
14708                             s++;
14709                         }
14710                         if (s[-1] == ' ')
14711                             s[-1] = 0;
14712
14713                         sv_catpv(sv, t);
14714                     }
14715
14716                 out_dump:
14717
14718                     Safefree(origs);
14719                 }
14720                 SvREFCNT_dec_NN(lv);
14721             }
14722         }
14723
14724         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14725     }
14726     else if (k == POSIXD || k == NPOSIXD) {
14727         U8 index = FLAGS(o) * 2;
14728         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14729             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14730         }
14731         else {
14732             sv_catpv(sv, anyofs[index]);
14733         }
14734     }
14735     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14736         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14737 #else
14738     PERL_UNUSED_CONTEXT;
14739     PERL_UNUSED_ARG(sv);
14740     PERL_UNUSED_ARG(o);
14741     PERL_UNUSED_ARG(prog);
14742 #endif  /* DEBUGGING */
14743 }
14744
14745 SV *
14746 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14747 {                               /* Assume that RE_INTUIT is set */
14748     dVAR;
14749     struct regexp *const prog = ReANY(r);
14750     GET_RE_DEBUG_FLAGS_DECL;
14751
14752     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14753     PERL_UNUSED_CONTEXT;
14754
14755     DEBUG_COMPILE_r(
14756         {
14757             const char * const s = SvPV_nolen_const(prog->check_substr
14758                       ? prog->check_substr : prog->check_utf8);
14759
14760             if (!PL_colorset) reginitcolors();
14761             PerlIO_printf(Perl_debug_log,
14762                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14763                       PL_colors[4],
14764                       prog->check_substr ? "" : "utf8 ",
14765                       PL_colors[5],PL_colors[0],
14766                       s,
14767                       PL_colors[1],
14768                       (strlen(s) > 60 ? "..." : ""));
14769         } );
14770
14771     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14772 }
14773
14774 /*
14775    pregfree()
14776
14777    handles refcounting and freeing the perl core regexp structure. When
14778    it is necessary to actually free the structure the first thing it
14779    does is call the 'free' method of the regexp_engine associated to
14780    the regexp, allowing the handling of the void *pprivate; member
14781    first. (This routine is not overridable by extensions, which is why
14782    the extensions free is called first.)
14783
14784    See regdupe and regdupe_internal if you change anything here.
14785 */
14786 #ifndef PERL_IN_XSUB_RE
14787 void
14788 Perl_pregfree(pTHX_ REGEXP *r)
14789 {
14790     SvREFCNT_dec(r);
14791 }
14792
14793 void
14794 Perl_pregfree2(pTHX_ REGEXP *rx)
14795 {
14796     dVAR;
14797     struct regexp *const r = ReANY(rx);
14798     GET_RE_DEBUG_FLAGS_DECL;
14799
14800     PERL_ARGS_ASSERT_PREGFREE2;
14801
14802     if (r->mother_re) {
14803         ReREFCNT_dec(r->mother_re);
14804     } else {
14805         CALLREGFREE_PVT(rx); /* free the private data */
14806         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14807         Safefree(r->xpv_len_u.xpvlenu_pv);
14808     }
14809     if (r->substrs) {
14810         SvREFCNT_dec(r->anchored_substr);
14811         SvREFCNT_dec(r->anchored_utf8);
14812         SvREFCNT_dec(r->float_substr);
14813         SvREFCNT_dec(r->float_utf8);
14814         Safefree(r->substrs);
14815     }
14816     RX_MATCH_COPY_FREE(rx);
14817 #ifdef PERL_ANY_COW
14818     SvREFCNT_dec(r->saved_copy);
14819 #endif
14820     Safefree(r->offs);
14821     SvREFCNT_dec(r->qr_anoncv);
14822     rx->sv_u.svu_rx = 0;
14823 }
14824
14825 /*  reg_temp_copy()
14826
14827     This is a hacky workaround to the structural issue of match results
14828     being stored in the regexp structure which is in turn stored in
14829     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14830     could be PL_curpm in multiple contexts, and could require multiple
14831     result sets being associated with the pattern simultaneously, such
14832     as when doing a recursive match with (??{$qr})
14833
14834     The solution is to make a lightweight copy of the regexp structure
14835     when a qr// is returned from the code executed by (??{$qr}) this
14836     lightweight copy doesn't actually own any of its data except for
14837     the starp/end and the actual regexp structure itself.
14838
14839 */
14840
14841
14842 REGEXP *
14843 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14844 {
14845     struct regexp *ret;
14846     struct regexp *const r = ReANY(rx);
14847     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14848
14849     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14850
14851     if (!ret_x)
14852         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14853     else {
14854         SvOK_off((SV *)ret_x);
14855         if (islv) {
14856             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14857                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14858                made both spots point to the same regexp body.) */
14859             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14860             assert(!SvPVX(ret_x));
14861             ret_x->sv_u.svu_rx = temp->sv_any;
14862             temp->sv_any = NULL;
14863             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14864             SvREFCNT_dec_NN(temp);
14865             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14866                ing below will not set it. */
14867             SvCUR_set(ret_x, SvCUR(rx));
14868         }
14869     }
14870     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14871        sv_force_normal(sv) is called.  */
14872     SvFAKE_on(ret_x);
14873     ret = ReANY(ret_x);
14874
14875     SvFLAGS(ret_x) |= SvUTF8(rx);
14876     /* We share the same string buffer as the original regexp, on which we
14877        hold a reference count, incremented when mother_re is set below.
14878        The string pointer is copied here, being part of the regexp struct.
14879      */
14880     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14881            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14882     if (r->offs) {
14883         const I32 npar = r->nparens+1;
14884         Newx(ret->offs, npar, regexp_paren_pair);
14885         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14886     }
14887     if (r->substrs) {
14888         Newx(ret->substrs, 1, struct reg_substr_data);
14889         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14890
14891         SvREFCNT_inc_void(ret->anchored_substr);
14892         SvREFCNT_inc_void(ret->anchored_utf8);
14893         SvREFCNT_inc_void(ret->float_substr);
14894         SvREFCNT_inc_void(ret->float_utf8);
14895
14896         /* check_substr and check_utf8, if non-NULL, point to either their
14897            anchored or float namesakes, and don't hold a second reference.  */
14898     }
14899     RX_MATCH_COPIED_off(ret_x);
14900 #ifdef PERL_ANY_COW
14901     ret->saved_copy = NULL;
14902 #endif
14903     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14904     SvREFCNT_inc_void(ret->qr_anoncv);
14905
14906     return ret_x;
14907 }
14908 #endif
14909
14910 /* regfree_internal()
14911
14912    Free the private data in a regexp. This is overloadable by
14913    extensions. Perl takes care of the regexp structure in pregfree(),
14914    this covers the *pprivate pointer which technically perl doesn't
14915    know about, however of course we have to handle the
14916    regexp_internal structure when no extension is in use.
14917
14918    Note this is called before freeing anything in the regexp
14919    structure.
14920  */
14921
14922 void
14923 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14924 {
14925     dVAR;
14926     struct regexp *const r = ReANY(rx);
14927     RXi_GET_DECL(r,ri);
14928     GET_RE_DEBUG_FLAGS_DECL;
14929
14930     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14931
14932     DEBUG_COMPILE_r({
14933         if (!PL_colorset)
14934             reginitcolors();
14935         {
14936             SV *dsv= sv_newmortal();
14937             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14938                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14939             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14940                 PL_colors[4],PL_colors[5],s);
14941         }
14942     });
14943 #ifdef RE_TRACK_PATTERN_OFFSETS
14944     if (ri->u.offsets)
14945         Safefree(ri->u.offsets);             /* 20010421 MJD */
14946 #endif
14947     if (ri->code_blocks) {
14948         int n;
14949         for (n = 0; n < ri->num_code_blocks; n++)
14950             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14951         Safefree(ri->code_blocks);
14952     }
14953
14954     if (ri->data) {
14955         int n = ri->data->count;
14956
14957         while (--n >= 0) {
14958           /* If you add a ->what type here, update the comment in regcomp.h */
14959             switch (ri->data->what[n]) {
14960             case 'a':
14961             case 'r':
14962             case 's':
14963             case 'S':
14964             case 'u':
14965                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14966                 break;
14967             case 'f':
14968                 Safefree(ri->data->data[n]);
14969                 break;
14970             case 'l':
14971             case 'L':
14972                 break;
14973             case 'T':
14974                 { /* Aho Corasick add-on structure for a trie node.
14975                      Used in stclass optimization only */
14976                     U32 refcount;
14977                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14978                     OP_REFCNT_LOCK;
14979                     refcount = --aho->refcount;
14980                     OP_REFCNT_UNLOCK;
14981                     if ( !refcount ) {
14982                         PerlMemShared_free(aho->states);
14983                         PerlMemShared_free(aho->fail);
14984                          /* do this last!!!! */
14985                         PerlMemShared_free(ri->data->data[n]);
14986                         PerlMemShared_free(ri->regstclass);
14987                     }
14988                 }
14989                 break;
14990             case 't':
14991                 {
14992                     /* trie structure. */
14993                     U32 refcount;
14994                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14995                     OP_REFCNT_LOCK;
14996                     refcount = --trie->refcount;
14997                     OP_REFCNT_UNLOCK;
14998                     if ( !refcount ) {
14999                         PerlMemShared_free(trie->charmap);
15000                         PerlMemShared_free(trie->states);
15001                         PerlMemShared_free(trie->trans);
15002                         if (trie->bitmap)
15003                             PerlMemShared_free(trie->bitmap);
15004                         if (trie->jump)
15005                             PerlMemShared_free(trie->jump);
15006                         PerlMemShared_free(trie->wordinfo);
15007                         /* do this last!!!! */
15008                         PerlMemShared_free(ri->data->data[n]);
15009                     }
15010                 }
15011                 break;
15012             default:
15013                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15014             }
15015         }
15016         Safefree(ri->data->what);
15017         Safefree(ri->data);
15018     }
15019
15020     Safefree(ri);
15021 }
15022
15023 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15024 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15025 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15026
15027 /*
15028    re_dup - duplicate a regexp.
15029
15030    This routine is expected to clone a given regexp structure. It is only
15031    compiled under USE_ITHREADS.
15032
15033    After all of the core data stored in struct regexp is duplicated
15034    the regexp_engine.dupe method is used to copy any private data
15035    stored in the *pprivate pointer. This allows extensions to handle
15036    any duplication it needs to do.
15037
15038    See pregfree() and regfree_internal() if you change anything here.
15039 */
15040 #if defined(USE_ITHREADS)
15041 #ifndef PERL_IN_XSUB_RE
15042 void
15043 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15044 {
15045     dVAR;
15046     I32 npar;
15047     const struct regexp *r = ReANY(sstr);
15048     struct regexp *ret = ReANY(dstr);
15049
15050     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15051
15052     npar = r->nparens+1;
15053     Newx(ret->offs, npar, regexp_paren_pair);
15054     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15055
15056     if (ret->substrs) {
15057         /* Do it this way to avoid reading from *r after the StructCopy().
15058            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15059            cache, it doesn't matter.  */
15060         const bool anchored = r->check_substr
15061             ? r->check_substr == r->anchored_substr
15062             : r->check_utf8 == r->anchored_utf8;
15063         Newx(ret->substrs, 1, struct reg_substr_data);
15064         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15065
15066         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15067         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15068         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15069         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15070
15071         /* check_substr and check_utf8, if non-NULL, point to either their
15072            anchored or float namesakes, and don't hold a second reference.  */
15073
15074         if (ret->check_substr) {
15075             if (anchored) {
15076                 assert(r->check_utf8 == r->anchored_utf8);
15077                 ret->check_substr = ret->anchored_substr;
15078                 ret->check_utf8 = ret->anchored_utf8;
15079             } else {
15080                 assert(r->check_substr == r->float_substr);
15081                 assert(r->check_utf8 == r->float_utf8);
15082                 ret->check_substr = ret->float_substr;
15083                 ret->check_utf8 = ret->float_utf8;
15084             }
15085         } else if (ret->check_utf8) {
15086             if (anchored) {
15087                 ret->check_utf8 = ret->anchored_utf8;
15088             } else {
15089                 ret->check_utf8 = ret->float_utf8;
15090             }
15091         }
15092     }
15093
15094     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15095     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15096
15097     if (ret->pprivate)
15098         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15099
15100     if (RX_MATCH_COPIED(dstr))
15101         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15102     else
15103         ret->subbeg = NULL;
15104 #ifdef PERL_ANY_COW
15105     ret->saved_copy = NULL;
15106 #endif
15107
15108     /* Whether mother_re be set or no, we need to copy the string.  We
15109        cannot refrain from copying it when the storage points directly to
15110        our mother regexp, because that's
15111                1: a buffer in a different thread
15112                2: something we no longer hold a reference on
15113                so we need to copy it locally.  */
15114     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15115     ret->mother_re   = NULL;
15116     ret->gofs = 0;
15117 }
15118 #endif /* PERL_IN_XSUB_RE */
15119
15120 /*
15121    regdupe_internal()
15122
15123    This is the internal complement to regdupe() which is used to copy
15124    the structure pointed to by the *pprivate pointer in the regexp.
15125    This is the core version of the extension overridable cloning hook.
15126    The regexp structure being duplicated will be copied by perl prior
15127    to this and will be provided as the regexp *r argument, however
15128    with the /old/ structures pprivate pointer value. Thus this routine
15129    may override any copying normally done by perl.
15130
15131    It returns a pointer to the new regexp_internal structure.
15132 */
15133
15134 void *
15135 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15136 {
15137     dVAR;
15138     struct regexp *const r = ReANY(rx);
15139     regexp_internal *reti;
15140     int len;
15141     RXi_GET_DECL(r,ri);
15142
15143     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15144
15145     len = ProgLen(ri);
15146
15147     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15148     Copy(ri->program, reti->program, len+1, regnode);
15149
15150     reti->num_code_blocks = ri->num_code_blocks;
15151     if (ri->code_blocks) {
15152         int n;
15153         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15154                 struct reg_code_block);
15155         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15156                 struct reg_code_block);
15157         for (n = 0; n < ri->num_code_blocks; n++)
15158              reti->code_blocks[n].src_regex = (REGEXP*)
15159                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15160     }
15161     else
15162         reti->code_blocks = NULL;
15163
15164     reti->regstclass = NULL;
15165
15166     if (ri->data) {
15167         struct reg_data *d;
15168         const int count = ri->data->count;
15169         int i;
15170
15171         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15172                 char, struct reg_data);
15173         Newx(d->what, count, U8);
15174
15175         d->count = count;
15176         for (i = 0; i < count; i++) {
15177             d->what[i] = ri->data->what[i];
15178             switch (d->what[i]) {
15179                 /* see also regcomp.h and regfree_internal() */
15180             case 'a': /* actually an AV, but the dup function is identical.  */
15181             case 'r':
15182             case 's':
15183             case 'S':
15184             case 'u': /* actually an HV, but the dup function is identical.  */
15185                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15186                 break;
15187             case 'f':
15188                 /* This is cheating. */
15189                 Newx(d->data[i], 1, struct regnode_charclass_class);
15190                 StructCopy(ri->data->data[i], d->data[i],
15191                             struct regnode_charclass_class);
15192                 reti->regstclass = (regnode*)d->data[i];
15193                 break;
15194             case 'T':
15195                 /* Trie stclasses are readonly and can thus be shared
15196                  * without duplication. We free the stclass in pregfree
15197                  * when the corresponding reg_ac_data struct is freed.
15198                  */
15199                 reti->regstclass= ri->regstclass;
15200                 /* Fall through */
15201             case 't':
15202                 OP_REFCNT_LOCK;
15203                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15204                 OP_REFCNT_UNLOCK;
15205                 /* Fall through */
15206             case 'l':
15207             case 'L':
15208                 d->data[i] = ri->data->data[i];
15209                 break;
15210             default:
15211                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15212             }
15213         }
15214
15215         reti->data = d;
15216     }
15217     else
15218         reti->data = NULL;
15219
15220     reti->name_list_idx = ri->name_list_idx;
15221
15222 #ifdef RE_TRACK_PATTERN_OFFSETS
15223     if (ri->u.offsets) {
15224         Newx(reti->u.offsets, 2*len+1, U32);
15225         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15226     }
15227 #else
15228     SetProgLen(reti,len);
15229 #endif
15230
15231     return (void*)reti;
15232 }
15233
15234 #endif    /* USE_ITHREADS */
15235
15236 #ifndef PERL_IN_XSUB_RE
15237
15238 /*
15239  - regnext - dig the "next" pointer out of a node
15240  */
15241 regnode *
15242 Perl_regnext(pTHX_ regnode *p)
15243 {
15244     dVAR;
15245     I32 offset;
15246
15247     if (!p)
15248         return(NULL);
15249
15250     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15251         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15252     }
15253
15254     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15255     if (offset == 0)
15256         return(NULL);
15257
15258     return(p+offset);
15259 }
15260 #endif
15261
15262 STATIC void
15263 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15264 {
15265     va_list args;
15266     STRLEN l1 = strlen(pat1);
15267     STRLEN l2 = strlen(pat2);
15268     char buf[512];
15269     SV *msv;
15270     const char *message;
15271
15272     PERL_ARGS_ASSERT_RE_CROAK2;
15273
15274     if (l1 > 510)
15275         l1 = 510;
15276     if (l1 + l2 > 510)
15277         l2 = 510 - l1;
15278     Copy(pat1, buf, l1 , char);
15279     Copy(pat2, buf + l1, l2 , char);
15280     buf[l1 + l2] = '\n';
15281     buf[l1 + l2 + 1] = '\0';
15282 #ifdef I_STDARG
15283     /* ANSI variant takes additional second argument */
15284     va_start(args, pat2);
15285 #else
15286     va_start(args);
15287 #endif
15288     msv = vmess(buf, &args);
15289     va_end(args);
15290     message = SvPV_const(msv,l1);
15291     if (l1 > 512)
15292         l1 = 512;
15293     Copy(message, buf, l1 , char);
15294     buf[l1-1] = '\0';                   /* Overwrite \n */
15295     Perl_croak(aTHX_ "%s", buf);
15296 }
15297
15298 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15299
15300 #ifndef PERL_IN_XSUB_RE
15301 void
15302 Perl_save_re_context(pTHX)
15303 {
15304     dVAR;
15305
15306     struct re_save_state *state;
15307
15308     SAVEVPTR(PL_curcop);
15309     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15310
15311     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15312     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15313     SSPUSHUV(SAVEt_RE_STATE);
15314
15315     Copy(&PL_reg_state, state, 1, struct re_save_state);
15316
15317     PL_reg_oldsaved = NULL;
15318     PL_reg_oldsavedlen = 0;
15319     PL_reg_oldsavedoffset = 0;
15320     PL_reg_oldsavedcoffset = 0;
15321     PL_reg_maxiter = 0;
15322     PL_reg_leftiter = 0;
15323     PL_reg_poscache = NULL;
15324     PL_reg_poscache_size = 0;
15325 #ifdef PERL_ANY_COW
15326     PL_nrs = NULL;
15327 #endif
15328
15329     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15330     if (PL_curpm) {
15331         const REGEXP * const rx = PM_GETRE(PL_curpm);
15332         if (rx) {
15333             U32 i;
15334             for (i = 1; i <= RX_NPARENS(rx); i++) {
15335                 char digits[TYPE_CHARS(long)];
15336                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15337                 GV *const *const gvp
15338                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15339
15340                 if (gvp) {
15341                     GV * const gv = *gvp;
15342                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15343                         save_scalar(gv);
15344                 }
15345             }
15346         }
15347     }
15348 }
15349 #endif
15350
15351 #ifdef DEBUGGING
15352
15353 STATIC void
15354 S_put_byte(pTHX_ SV *sv, int c)
15355 {
15356     PERL_ARGS_ASSERT_PUT_BYTE;
15357
15358     /* Our definition of isPRINT() ignores locales, so only bytes that are
15359        not part of UTF-8 are considered printable. I assume that the same
15360        holds for UTF-EBCDIC.
15361        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15362        which Wikipedia says:
15363
15364        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15365        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15366        identical, to the ASCII delete (DEL) or rubout control character. ...
15367        it is typically mapped to hexadecimal code 9F, in order to provide a
15368        unique character mapping in both directions)
15369
15370        So the old condition can be simplified to !isPRINT(c)  */
15371     if (!isPRINT(c)) {
15372         if (c < 256) {
15373             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15374         }
15375         else {
15376             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15377         }
15378     }
15379     else {
15380         const char string = c;
15381         if (c == '-' || c == ']' || c == '\\' || c == '^')
15382             sv_catpvs(sv, "\\");
15383         sv_catpvn(sv, &string, 1);
15384     }
15385 }
15386
15387
15388 #define CLEAR_OPTSTART \
15389     if (optstart) STMT_START { \
15390             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15391             optstart=NULL; \
15392     } STMT_END
15393
15394 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15395
15396 STATIC const regnode *
15397 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15398             const regnode *last, const regnode *plast,
15399             SV* sv, I32 indent, U32 depth)
15400 {
15401     dVAR;
15402     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15403     const regnode *next;
15404     const regnode *optstart= NULL;
15405
15406     RXi_GET_DECL(r,ri);
15407     GET_RE_DEBUG_FLAGS_DECL;
15408
15409     PERL_ARGS_ASSERT_DUMPUNTIL;
15410
15411 #ifdef DEBUG_DUMPUNTIL
15412     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15413         last ? last-start : 0,plast ? plast-start : 0);
15414 #endif
15415
15416     if (plast && plast < last)
15417         last= plast;
15418
15419     while (PL_regkind[op] != END && (!last || node < last)) {
15420         /* While that wasn't END last time... */
15421         NODE_ALIGN(node);
15422         op = OP(node);
15423         if (op == CLOSE || op == WHILEM)
15424             indent--;
15425         next = regnext((regnode *)node);
15426
15427         /* Where, what. */
15428         if (OP(node) == OPTIMIZED) {
15429             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15430                 optstart = node;
15431             else
15432                 goto after_print;
15433         } else
15434             CLEAR_OPTSTART;
15435
15436         regprop(r, sv, node);
15437         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15438                       (int)(2*indent + 1), "", SvPVX_const(sv));
15439
15440         if (OP(node) != OPTIMIZED) {
15441             if (next == NULL)           /* Next ptr. */
15442                 PerlIO_printf(Perl_debug_log, " (0)");
15443             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15444                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15445             else
15446                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15447             (void)PerlIO_putc(Perl_debug_log, '\n');
15448         }
15449
15450       after_print:
15451         if (PL_regkind[(U8)op] == BRANCHJ) {
15452             assert(next);
15453             {
15454                 const regnode *nnode = (OP(next) == LONGJMP
15455                                        ? regnext((regnode *)next)
15456                                        : next);
15457                 if (last && nnode > last)
15458                     nnode = last;
15459                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15460             }
15461         }
15462         else if (PL_regkind[(U8)op] == BRANCH) {
15463             assert(next);
15464             DUMPUNTIL(NEXTOPER(node), next);
15465         }
15466         else if ( PL_regkind[(U8)op]  == TRIE ) {
15467             const regnode *this_trie = node;
15468             const char op = OP(node);
15469             const U32 n = ARG(node);
15470             const reg_ac_data * const ac = op>=AHOCORASICK ?
15471                (reg_ac_data *)ri->data->data[n] :
15472                NULL;
15473             const reg_trie_data * const trie =
15474                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15475 #ifdef DEBUGGING
15476             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15477 #endif
15478             const regnode *nextbranch= NULL;
15479             I32 word_idx;
15480             sv_setpvs(sv, "");
15481             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15482                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15483
15484                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15485                    (int)(2*(indent+3)), "",
15486                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15487                             PL_colors[0], PL_colors[1],
15488                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15489                             PERL_PV_PRETTY_ELLIPSES    |
15490                             PERL_PV_PRETTY_LTGT
15491                             )
15492                             : "???"
15493                 );
15494                 if (trie->jump) {
15495                     U16 dist= trie->jump[word_idx+1];
15496                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15497                                   (UV)((dist ? this_trie + dist : next) - start));
15498                     if (dist) {
15499                         if (!nextbranch)
15500                             nextbranch= this_trie + trie->jump[0];
15501                         DUMPUNTIL(this_trie + dist, nextbranch);
15502                     }
15503                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15504                         nextbranch= regnext((regnode *)nextbranch);
15505                 } else {
15506                     PerlIO_printf(Perl_debug_log, "\n");
15507                 }
15508             }
15509             if (last && next > last)
15510                 node= last;
15511             else
15512                 node= next;
15513         }
15514         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15515             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15516                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15517         }
15518         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15519             assert(next);
15520             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15521         }
15522         else if ( op == PLUS || op == STAR) {
15523             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15524         }
15525         else if (PL_regkind[(U8)op] == ANYOF) {
15526             /* arglen 1 + class block */
15527             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15528                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15529             node = NEXTOPER(node);
15530         }
15531         else if (PL_regkind[(U8)op] == EXACT) {
15532             /* Literal string, where present. */
15533             node += NODE_SZ_STR(node) - 1;
15534             node = NEXTOPER(node);
15535         }
15536         else {
15537             node = NEXTOPER(node);
15538             node += regarglen[(U8)op];
15539         }
15540         if (op == CURLYX || op == OPEN)
15541             indent++;
15542     }
15543     CLEAR_OPTSTART;
15544 #ifdef DEBUG_DUMPUNTIL
15545     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15546 #endif
15547     return node;
15548 }
15549
15550 #endif  /* DEBUGGING */
15551
15552 /*
15553  * Local variables:
15554  * c-indentation-style: bsd
15555  * c-basic-offset: 4
15556  * indent-tabs-mode: nil
15557  * End:
15558  *
15559  * ex: set ts=8 sts=4 sw=4 et:
15560  */