src/5017003/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91
  92 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112
 113 typedef struct RExC_state_t {
 114     U32         flags;                  /* RXf_* are we folding, multilining? */
 115     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 116     char        *precomp;               /* uncompiled string. */
 117     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 118     regexp      *rx;                    /* perl core regexp structure */
 119     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 120     char        *start;                 /* Start of input for compile */
 121     char        *end;                   /* End of input for compile */
 122     char        *parse;                 /* Input-scan pointer. */
 123     I32         whilem_seen;            /* number of WHILEM in this expr */
 124     regnode     *emit_start;            /* Start of emitted-code area */
 125     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 126     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     I32         size;                   /* Code size. */
 131     I32         npar;                   /* Capture buffer count, (OPEN). */
 132     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 133     I32         nestroot;               /* root parens we are in - used by accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     I32         in_lookbehind;
 151     I32         contains_locale;
 152     I32         override_recoding;
 153     struct reg_code_block *code_blocks; /* positions of literal (?{})
 154                                             within pattern */
 155     int         num_code_blocks;        /* size of code_blocks[] */
 156     int         code_index;             /* next code_blocks[] slot */
 157 #if ADD_TO_REGEXEC
 158     char        *starttry;              /* -Dr: where regtry was called. */
 159 #define RExC_starttry   (pRExC_state->starttry)
 160 #endif
 161     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 162 #ifdef DEBUGGING
 163     const char  *lastparse;
 164     I32         lastnum;
 165     AV          *paren_name_list;       /* idx -> name */
 166 #define RExC_lastparse  (pRExC_state->lastparse)
 167 #define RExC_lastnum    (pRExC_state->lastnum)
 168 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 169 #endif
 170 } RExC_state_t;
 171
 172 #define RExC_flags      (pRExC_state->flags)
 173 #define RExC_pm_flags   (pRExC_state->pm_flags)
 174 #define RExC_precomp    (pRExC_state->precomp)
 175 #define RExC_rx_sv      (pRExC_state->rx_sv)
 176 #define RExC_rx         (pRExC_state->rx)
 177 #define RExC_rxi        (pRExC_state->rxi)
 178 #define RExC_start      (pRExC_state->start)
 179 #define RExC_end        (pRExC_state->end)
 180 #define RExC_parse      (pRExC_state->parse)
 181 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 182 #ifdef RE_TRACK_PATTERN_OFFSETS
 183 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 184 #endif
 185 #define RExC_emit       (pRExC_state->emit)
 186 #define RExC_emit_start (pRExC_state->emit_start)
 187 #define RExC_emit_bound (pRExC_state->emit_bound)
 188 #define RExC_naughty    (pRExC_state->naughty)
 189 #define RExC_sawback    (pRExC_state->sawback)
 190 #define RExC_seen       (pRExC_state->seen)
 191 #define RExC_size       (pRExC_state->size)
 192 #define RExC_npar       (pRExC_state->npar)
 193 #define RExC_nestroot   (pRExC_state->nestroot)
 194 #define RExC_extralen   (pRExC_state->extralen)
 195 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 196 #define RExC_utf8       (pRExC_state->utf8)
 197 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 198 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 199 #define RExC_open_parens        (pRExC_state->open_parens)
 200 #define RExC_close_parens       (pRExC_state->close_parens)
 201 #define RExC_opend      (pRExC_state->opend)
 202 #define RExC_paren_names        (pRExC_state->paren_names)
 203 #define RExC_recurse    (pRExC_state->recurse)
 204 #define RExC_recurse_count      (pRExC_state->recurse_count)
 205 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 206 #define RExC_contains_locale    (pRExC_state->contains_locale)
 207 #define RExC_override_recoding  (pRExC_state->override_recoding)
 208
 209
 210 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 211 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 212         ((*s) == '{' && regcurly(s)))
 213
 214 #ifdef SPSTART
 215 #undef SPSTART          /* dratted cpp namespace... */
 216 #endif
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
 224  * character, and if utf8, must be invariant.  Note that this is not the same
 225  * thing as REGNODE_SIMPLE */
 226 #define SIMPLE          0x02
 227 #define SPSTART         0x04    /* Starts with * or +. */
 228 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 229 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 230
 231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 232
 233 /* whether trie related optimizations are enabled */
 234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 235 #define TRIE_STUDY_OPT
 236 #define FULL_TRIE_STUDY
 237 #define TRIE_STCLASS
 238 #endif
 239
 240
 241
 242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 243 #define PBITVAL(paren) (1 << ((paren) & 7))
 244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 247
 248 /* If not already in utf8, do a longjmp back to the beginning */
 249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 252                         } STMT_END
 253
 254 /* About scan_data_t.
 255
 256   During optimisation we recurse through the regexp program performing
 257   various inplace (keyhole style) optimisations. In addition study_chunk
 258   and scan_commit populate this data structure with information about
 259   what strings MUST appear in the pattern. We look for the longest
 260   string that must appear at a fixed location, and we look for the
 261   longest string that may appear at a floating location. So for instance
 262   in the pattern:
 263
 264     /FOO[xX]A.*B[xX]BAR/
 265
 266   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 267   strings (because they follow a .* construct). study_chunk will identify
 268   both FOO and BAR as being the longest fixed and floating strings respectively.
 269
 270   The strings can be composites, for instance
 271
 272      /(f)(o)(o)/
 273
 274   will result in a composite fixed substring 'foo'.
 275
 276   For each string some basic information is maintained:
 277
 278   - offset or min_offset
 279     This is the position the string must appear at, or not before.
 280     It also implicitly (when combined with minlenp) tells us how many
 281     characters must match before the string we are searching for.
 282     Likewise when combined with minlenp and the length of the string it
 283     tells us how many characters must appear after the string we have
 284     found.
 285
 286   - max_offset
 287     Only used for floating strings. This is the rightmost point that
 288     the string can appear at. If set to I32 max it indicates that the
 289     string can occur infinitely far to the right.
 290
 291   - minlenp
 292     A pointer to the minimum length of the pattern that the string
 293     was found inside. This is important as in the case of positive
 294     lookahead or positive lookbehind we can have multiple patterns
 295     involved. Consider
 296
 297     /(?=FOO).*F/
 298
 299     The minimum length of the pattern overall is 3, the minimum length
 300     of the lookahead part is 3, but the minimum length of the part that
 301     will actually match is 1. So 'FOO's minimum length is 3, but the
 302     minimum length for the F is 1. This is important as the minimum length
 303     is used to determine offsets in front of and behind the string being
 304     looked for.  Since strings can be composites this is the length of the
 305     pattern at the time it was committed with a scan_commit. Note that
 306     the length is calculated by study_chunk, so that the minimum lengths
 307     are not known until the full pattern has been compiled, thus the
 308     pointer to the value.
 309
 310   - lookbehind
 311
 312     In the case of lookbehind the string being searched for can be
 313     offset past the start point of the final matching string.
 314     If this value was just blithely removed from the min_offset it would
 315     invalidate some of the calculations for how many chars must match
 316     before or after (as they are derived from min_offset and minlen and
 317     the length of the string being searched for).
 318     When the final pattern is compiled and the data is moved from the
 319     scan_data_t structure into the regexp structure the information
 320     about lookbehind is factored in, with the information that would
 321     have been lost precalculated in the end_shift field for the
 322     associated string.
 323
 324   The fields pos_min and pos_delta are used to store the minimum offset
 325   and the delta to the maximum offset at the current point in the pattern.
 326
 327 */
 328
 329 typedef struct scan_data_t {
 330     /*I32 len_min;      unused */
 331     /*I32 len_delta;    unused */
 332     I32 pos_min;
 333     I32 pos_delta;
 334     SV *last_found;
 335     I32 last_end;           /* min value, <0 unless valid. */
 336     I32 last_start_min;
 337     I32 last_start_max;
 338     SV **longest;           /* Either &l_fixed, or &l_float. */
 339     SV *longest_fixed;      /* longest fixed string found in pattern */
 340     I32 offset_fixed;       /* offset where it starts */
 341     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 342     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 343     SV *longest_float;      /* longest floating string found in pattern */
 344     I32 offset_float_min;   /* earliest point in string it can appear */
 345     I32 offset_float_max;   /* latest point in string it can appear */
 346     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 347     I32 lookbehind_float;   /* is the position of the string modified by LB */
 348     I32 flags;
 349     I32 whilem_c;
 350     I32 *last_closep;
 351     struct regnode_charclass_class *start_class;
 352 } scan_data_t;
 353
 354 /*
 355  * Forward declarations for pregcomp()'s friends.
 356  */
 357
 358 static const scan_data_t zero_scan_data =
 359   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 360
 361 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 362 #define SF_BEFORE_SEOL          0x0001
 363 #define SF_BEFORE_MEOL          0x0002
 364 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 365 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 366
 367 #ifdef NO_UNARY_PLUS
 368 #  define SF_FIX_SHIFT_EOL      (0+2)
 369 #  define SF_FL_SHIFT_EOL               (0+4)
 370 #else
 371 #  define SF_FIX_SHIFT_EOL      (+2)
 372 #  define SF_FL_SHIFT_EOL               (+4)
 373 #endif
 374
 375 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 376 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 377
 378 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 379 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 380 #define SF_IS_INF               0x0040
 381 #define SF_HAS_PAR              0x0080
 382 #define SF_IN_PAR               0x0100
 383 #define SF_HAS_EVAL             0x0200
 384 #define SCF_DO_SUBSTR           0x0400
 385 #define SCF_DO_STCLASS_AND      0x0800
 386 #define SCF_DO_STCLASS_OR       0x1000
 387 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 388 #define SCF_WHILEM_VISITED_POS  0x2000
 389
 390 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 391 #define SCF_SEEN_ACCEPT         0x8000
 392
 393 #define UTF cBOOL(RExC_utf8)
 394
 395 /* The enums for all these are ordered so things work out correctly */
 396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 402 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 403
 404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 405
 406 #define OOB_NAMEDCLASS          -1
 407
 408 /* There is no code point that is out-of-bounds, so this is problematic.  But
 409  * its only current use is to initialize a variable that is always set before
 410  * looked at. */
 411 #define OOB_UNICODE             0xDEADBEEF
 412
 413 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 414 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 415
 416
 417 /* length of regex to show in messages that don't mark a position within */
 418 #define RegexLengthToShowInErrorMessages 127
 419
 420 /*
 421  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 422  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 423  * op/pragma/warn/regcomp.
 424  */
 425 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 426 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 427
 428 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 429
 430 /*
 431  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 432  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 433  * "...".
 434  */
 435 #define _FAIL(code) STMT_START {                                        \
 436     const char *ellipses = "";                                          \
 437     IV len = RExC_end - RExC_precomp;                                   \
 438                                                                         \
 439     if (!SIZE_ONLY)                                                     \
 440         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 441     if (len > RegexLengthToShowInErrorMessages) {                       \
 442         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 443         len = RegexLengthToShowInErrorMessages - 10;                    \
 444         ellipses = "...";                                               \
 445     }                                                                   \
 446     code;                                                               \
 447 } STMT_END
 448
 449 #define FAIL(msg) _FAIL(                            \
 450     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 451             msg, (int)len, RExC_precomp, ellipses))
 452
 453 #define FAIL2(msg,arg) _FAIL(                       \
 454     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 455             arg, (int)len, RExC_precomp, ellipses))
 456
 457 /*
 458  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 459  */
 460 #define Simple_vFAIL(m) STMT_START {                                    \
 461     const IV offset = RExC_parse - RExC_precomp;                        \
 462     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 463             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 468  */
 469 #define vFAIL(m) STMT_START {                           \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL(m);                                    \
 473 } STMT_END
 474
 475 /*
 476  * Like Simple_vFAIL(), but accepts two arguments.
 477  */
 478 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 479     const IV offset = RExC_parse - RExC_precomp;                        \
 480     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 481             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 482 } STMT_END
 483
 484 /*
 485  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 486  */
 487 #define vFAIL2(m,a1) STMT_START {                       \
 488     if (!SIZE_ONLY)                                     \
 489         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 490     Simple_vFAIL2(m, a1);                               \
 491 } STMT_END
 492
 493
 494 /*
 495  * Like Simple_vFAIL(), but accepts three arguments.
 496  */
 497 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 498     const IV offset = RExC_parse - RExC_precomp;                \
 499     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 500             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 501 } STMT_END
 502
 503 /*
 504  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 505  */
 506 #define vFAIL3(m,a1,a2) STMT_START {                    \
 507     if (!SIZE_ONLY)                                     \
 508         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 509     Simple_vFAIL3(m, a1, a2);                           \
 510 } STMT_END
 511
 512 /*
 513  * Like Simple_vFAIL(), but accepts four arguments.
 514  */
 515 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 516     const IV offset = RExC_parse - RExC_precomp;                \
 517     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 518             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 519 } STMT_END
 520
 521 #define ckWARNreg(loc,m) STMT_START {                                   \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 525 } STMT_END
 526
 527 #define ckWARNregdep(loc,m) STMT_START {                                \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 530             m REPORT_LOCATION,                                          \
 531             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 532 } STMT_END
 533
 534 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 535     const IV offset = loc - RExC_precomp;                               \
 536     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 537             m REPORT_LOCATION,                                          \
 538             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 539 } STMT_END
 540
 541 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 542     const IV offset = loc - RExC_precomp;                               \
 543     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 544             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 545 } STMT_END
 546
 547 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 548     const IV offset = loc - RExC_precomp;                               \
 549     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 550             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 551 } STMT_END
 552
 553 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 556             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 557 } STMT_END
 558
 559 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 560     const IV offset = loc - RExC_precomp;                               \
 561     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 562             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 563 } STMT_END
 564
 565 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 566     const IV offset = loc - RExC_precomp;                               \
 567     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 568             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 569 } STMT_END
 570
 571 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 574             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 575 } STMT_END
 576
 577
 578 /* Allow for side effects in s */
 579 #define REGC(c,s) STMT_START {                  \
 580     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 581 } STMT_END
 582
 583 /* Macros for recording node offsets.   20001227 mjd@plover.com
 584  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 585  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 586  * Element 0 holds the number n.
 587  * Position is 1 indexed.
 588  */
 589 #ifndef RE_TRACK_PATTERN_OFFSETS
 590 #define Set_Node_Offset_To_R(node,byte)
 591 #define Set_Node_Offset(node,byte)
 592 #define Set_Cur_Node_Offset
 593 #define Set_Node_Length_To_R(node,len)
 594 #define Set_Node_Length(node,len)
 595 #define Set_Node_Cur_Length(node)
 596 #define Node_Offset(n)
 597 #define Node_Length(n)
 598 #define Set_Node_Offset_Length(node,offset,len)
 599 #define ProgLen(ri) ri->u.proglen
 600 #define SetProgLen(ri,x) ri->u.proglen = x
 601 #else
 602 #define ProgLen(ri) ri->u.offsets[0]
 603 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 604 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 605     if (! SIZE_ONLY) {                                                  \
 606         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 607                     __LINE__, (int)(node), (int)(byte)));               \
 608         if((node) < 0) {                                                \
 609             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 610         } else {                                                        \
 611             RExC_offsets[2*(node)-1] = (byte);                          \
 612         }                                                               \
 613     }                                                                   \
 614 } STMT_END
 615
 616 #define Set_Node_Offset(node,byte) \
 617     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 618 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 619
 620 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 621     if (! SIZE_ONLY) {                                                  \
 622         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 623                 __LINE__, (int)(node), (int)(len)));                    \
 624         if((node) < 0) {                                                \
 625             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 626         } else {                                                        \
 627             RExC_offsets[2*(node)] = (len);                             \
 628         }                                                               \
 629     }                                                                   \
 630 } STMT_END
 631
 632 #define Set_Node_Length(node,len) \
 633     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 634 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 635 #define Set_Node_Cur_Length(node) \
 636     Set_Node_Length(node, RExC_parse - parse_start)
 637
 638 /* Get offsets and lengths */
 639 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 640 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 641
 642 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 643     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 644     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 645 } STMT_END
 646 #endif
 647
 648 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 649 #define EXPERIMENTAL_INPLACESCAN
 650 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 651
 652 #define DEBUG_STUDYDATA(str,data,depth)                              \
 653 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 654     PerlIO_printf(Perl_debug_log,                                    \
 655         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 656         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 657         (int)(depth)*2, "",                                          \
 658         (IV)((data)->pos_min),                                       \
 659         (IV)((data)->pos_delta),                                     \
 660         (UV)((data)->flags),                                         \
 661         (IV)((data)->whilem_c),                                      \
 662         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 663         is_inf ? "INF " : ""                                         \
 664     );                                                               \
 665     if ((data)->last_found)                                          \
 666         PerlIO_printf(Perl_debug_log,                                \
 667             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 668             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 669             SvPVX_const((data)->last_found),                         \
 670             (IV)((data)->last_end),                                  \
 671             (IV)((data)->last_start_min),                            \
 672             (IV)((data)->last_start_max),                            \
 673             ((data)->longest &&                                      \
 674              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 675             SvPVX_const((data)->longest_fixed),                      \
 676             (IV)((data)->offset_fixed),                              \
 677             ((data)->longest &&                                      \
 678              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 679             SvPVX_const((data)->longest_float),                      \
 680             (IV)((data)->offset_float_min),                          \
 681             (IV)((data)->offset_float_max)                           \
 682         );                                                           \
 683     PerlIO_printf(Perl_debug_log,"\n");                              \
 684 });
 685
 686 static void clear_re(pTHX_ void *r);
 687
 688 /* Mark that we cannot extend a found fixed substring at this point.
 689    Update the longest found anchored substring and the longest found
 690    floating substrings if needed. */
 691
 692 STATIC void
 693 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 694 {
 695     const STRLEN l = CHR_SVLEN(data->last_found);
 696     const STRLEN old_l = CHR_SVLEN(*data->longest);
 697     GET_RE_DEBUG_FLAGS_DECL;
 698
 699     PERL_ARGS_ASSERT_SCAN_COMMIT;
 700
 701     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 702         SvSetMagicSV(*data->longest, data->last_found);
 703         if (*data->longest == data->longest_fixed) {
 704             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 705             if (data->flags & SF_BEFORE_EOL)
 706                 data->flags
 707                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 708             else
 709                 data->flags &= ~SF_FIX_BEFORE_EOL;
 710             data->minlen_fixed=minlenp;
 711             data->lookbehind_fixed=0;
 712         }
 713         else { /* *data->longest == data->longest_float */
 714             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 715             data->offset_float_max = (l
 716                                       ? data->last_start_max
 717                                       : data->pos_min + data->pos_delta);
 718             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 719                 data->offset_float_max = I32_MAX;
 720             if (data->flags & SF_BEFORE_EOL)
 721                 data->flags
 722                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 723             else
 724                 data->flags &= ~SF_FL_BEFORE_EOL;
 725             data->minlen_float=minlenp;
 726             data->lookbehind_float=0;
 727         }
 728     }
 729     SvCUR_set(data->last_found, 0);
 730     {
 731         SV * const sv = data->last_found;
 732         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 733             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 734             if (mg)
 735                 mg->mg_len = 0;
 736         }
 737     }
 738     data->last_end = -1;
 739     data->flags &= ~SF_BEFORE_EOL;
 740     DEBUG_STUDYDATA("commit: ",data,0);
 741 }
 742
 743 /* Can match anything (initialization) */
 744 STATIC void
 745 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 746 {
 747     PERL_ARGS_ASSERT_CL_ANYTHING;
 748
 749     ANYOF_BITMAP_SETALL(cl);
 750     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 751                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 752
 753     /* If any portion of the regex is to operate under locale rules,
 754      * initialization includes it.  The reason this isn't done for all regexes
 755      * is that the optimizer was written under the assumption that locale was
 756      * all-or-nothing.  Given the complexity and lack of documentation in the
 757      * optimizer, and that there are inadequate test cases for locale, so many
 758      * parts of it may not work properly, it is safest to avoid locale unless
 759      * necessary. */
 760     if (RExC_contains_locale) {
 761         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 762         cl->flags |= ANYOF_LOCALE;
 763     }
 764     else {
 765         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 766     }
 767 }
 768
 769 /* Can match anything (initialization) */
 770 STATIC int
 771 S_cl_is_anything(const struct regnode_charclass_class *cl)
 772 {
 773     int value;
 774
 775     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 776
 777     for (value = 0; value <= ANYOF_MAX; value += 2)
 778         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 779             return 1;
 780     if (!(cl->flags & ANYOF_UNICODE_ALL))
 781         return 0;
 782     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 783         return 0;
 784     return 1;
 785 }
 786
 787 /* Can match anything (initialization) */
 788 STATIC void
 789 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 790 {
 791     PERL_ARGS_ASSERT_CL_INIT;
 792
 793     Zero(cl, 1, struct regnode_charclass_class);
 794     cl->type = ANYOF;
 795     cl_anything(pRExC_state, cl);
 796     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 797 }
 798
 799 /* These two functions currently do the exact same thing */
 800 #define cl_init_zero            S_cl_init
 801
 802 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 803  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 804  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 805 STATIC void
 806 S_cl_and(struct regnode_charclass_class *cl,
 807         const struct regnode_charclass_class *and_with)
 808 {
 809     PERL_ARGS_ASSERT_CL_AND;
 810
 811     assert(and_with->type == ANYOF);
 812
 813     /* I (khw) am not sure all these restrictions are necessary XXX */
 814     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 815         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 816         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 817         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 818         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 819         int i;
 820
 821         if (and_with->flags & ANYOF_INVERT)
 822             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 823                 cl->bitmap[i] &= ~and_with->bitmap[i];
 824         else
 825             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 826                 cl->bitmap[i] &= and_with->bitmap[i];
 827     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 828
 829     if (and_with->flags & ANYOF_INVERT) {
 830
 831         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 832          * aren't affected by the inversion.  Those that are affected are
 833          * handled individually below */
 834         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 835         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 836         cl->flags |= affected_flags;
 837
 838         /* We currently don't know how to deal with things that aren't in the
 839          * bitmap, but we know that the intersection is no greater than what
 840          * is already in cl, so let there be false positives that get sorted
 841          * out after the synthetic start class succeeds, and the node is
 842          * matched for real. */
 843
 844         /* The inversion of these two flags indicate that the resulting
 845          * intersection doesn't have them */
 846         if (and_with->flags & ANYOF_UNICODE_ALL) {
 847             cl->flags &= ~ANYOF_UNICODE_ALL;
 848         }
 849         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 850             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 851         }
 852     }
 853     else {   /* and'd node is not inverted */
 854         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 855
 856         if (! ANYOF_NONBITMAP(and_with)) {
 857
 858             /* Here 'and_with' doesn't match anything outside the bitmap
 859              * (except possibly ANYOF_UNICODE_ALL), which means the
 860              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 861              * which case we don't know what the intersection is, but it's no
 862              * greater than what cl already has, so can just leave it alone,
 863              * with possible false positives */
 864             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 865                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 866                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 867             }
 868         }
 869         else if (! ANYOF_NONBITMAP(cl)) {
 870
 871             /* Here, 'and_with' does match something outside the bitmap, and cl
 872              * doesn't have a list of things to match outside the bitmap.  If
 873              * cl can match all code points above 255, the intersection will
 874              * be those above-255 code points that 'and_with' matches.  If cl
 875              * can't match all Unicode code points, it means that it can't
 876              * match anything outside the bitmap (since the 'if' that got us
 877              * into this block tested for that), so we leave the bitmap empty.
 878              */
 879             if (cl->flags & ANYOF_UNICODE_ALL) {
 880                 ARG_SET(cl, ARG(and_with));
 881
 882                 /* and_with's ARG may match things that don't require UTF8.
 883                  * And now cl's will too, in spite of this being an 'and'.  See
 884                  * the comments below about the kludge */
 885                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 886             }
 887         }
 888         else {
 889             /* Here, both 'and_with' and cl match something outside the
 890              * bitmap.  Currently we do not do the intersection, so just match
 891              * whatever cl had at the beginning.  */
 892         }
 893
 894
 895         /* Take the intersection of the two sets of flags.  However, the
 896          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 897          * kludge around the fact that this flag is not treated like the others
 898          * which are initialized in cl_anything().  The way the optimizer works
 899          * is that the synthetic start class (SSC) is initialized to match
 900          * anything, and then the first time a real node is encountered, its
 901          * values are AND'd with the SSC's with the result being the values of
 902          * the real node.  However, there are paths through the optimizer where
 903          * the AND never gets called, so those initialized bits are set
 904          * inappropriately, which is not usually a big deal, as they just cause
 905          * false positives in the SSC, which will just mean a probably
 906          * imperceptible slow down in execution.  However this bit has a
 907          * higher false positive consequence in that it can cause utf8.pm,
 908          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 909          * bigger slowdown and also causes significant extra memory to be used.
 910          * In order to prevent this, the code now takes a different tack.  The
 911          * bit isn't set unless some part of the regular expression needs it,
 912          * but once set it won't get cleared.  This means that these extra
 913          * modules won't get loaded unless there was some path through the
 914          * pattern that would have required them anyway, and  so any false
 915          * positives that occur by not ANDing them out when they could be
 916          * aren't as severe as they would be if we treated this bit like all
 917          * the others */
 918         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 919                                       & ANYOF_NONBITMAP_NON_UTF8;
 920         cl->flags &= and_with->flags;
 921         cl->flags |= outside_bitmap_but_not_utf8;
 922     }
 923 }
 924
 925 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 926  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 927  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 928 STATIC void
 929 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 930 {
 931     PERL_ARGS_ASSERT_CL_OR;
 932
 933     if (or_with->flags & ANYOF_INVERT) {
 934
 935         /* Here, the or'd node is to be inverted.  This means we take the
 936          * complement of everything not in the bitmap, but currently we don't
 937          * know what that is, so give up and match anything */
 938         if (ANYOF_NONBITMAP(or_with)) {
 939             cl_anything(pRExC_state, cl);
 940         }
 941         /* We do not use
 942          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 943          *   <= (B1 | !B2) | (CL1 | !CL2)
 944          * which is wasteful if CL2 is small, but we ignore CL2:
 945          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 946          * XXXX Can we handle case-fold?  Unclear:
 947          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 948          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 949          */
 950         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 951              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 952              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 953             int i;
 954
 955             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 956                 cl->bitmap[i] |= ~or_with->bitmap[i];
 957         } /* XXXX: logic is complicated otherwise */
 958         else {
 959             cl_anything(pRExC_state, cl);
 960         }
 961
 962         /* And, we can just take the union of the flags that aren't affected
 963          * by the inversion */
 964         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 965
 966         /* For the remaining flags:
 967             ANYOF_UNICODE_ALL and inverted means to not match anything above
 968                     255, which means that the union with cl should just be
 969                     what cl has in it, so can ignore this flag
 970             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 971                     is 127-255 to match them, but then invert that, so the
 972                     union with cl should just be what cl has in it, so can
 973                     ignore this flag
 974          */
 975     } else {    /* 'or_with' is not inverted */
 976         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 977         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 978              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 979                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 980             int i;
 981
 982             /* OR char bitmap and class bitmap separately */
 983             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 984                 cl->bitmap[i] |= or_with->bitmap[i];
 985             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 986                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 987                     cl->classflags[i] |= or_with->classflags[i];
 988                 cl->flags |= ANYOF_CLASS;
 989             }
 990         }
 991         else { /* XXXX: logic is complicated, leave it along for a moment. */
 992             cl_anything(pRExC_state, cl);
 993         }
 994
 995         if (ANYOF_NONBITMAP(or_with)) {
 996
 997             /* Use the added node's outside-the-bit-map match if there isn't a
 998              * conflict.  If there is a conflict (both nodes match something
 999              * outside the bitmap, but what they match outside is not the same
1000              * pointer, and hence not easily compared until XXX we extend
1001              * inversion lists this far), give up and allow the start class to
1002              * match everything outside the bitmap.  If that stuff is all above
1003              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1004             if (! ANYOF_NONBITMAP(cl)) {
1005                 ARG_SET(cl, ARG(or_with));
1006             }
1007             else if (ARG(cl) != ARG(or_with)) {
1008
1009                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1010                     cl_anything(pRExC_state, cl);
1011                 }
1012                 else {
1013                     cl->flags |= ANYOF_UNICODE_ALL;
1014                 }
1015             }
1016         }
1017
1018         /* Take the union */
1019         cl->flags |= or_with->flags;
1020     }
1021 }
1022
1023 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1024 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1025 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1026 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1027
1028
1029 #ifdef DEBUGGING
1030 /*
1031    dump_trie(trie,widecharmap,revcharmap)
1032    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1033    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1034
1035    These routines dump out a trie in a somewhat readable format.
1036    The _interim_ variants are used for debugging the interim
1037    tables that are used to generate the final compressed
1038    representation which is what dump_trie expects.
1039
1040    Part of the reason for their existence is to provide a form
1041    of documentation as to how the different representations function.
1042
1043 */
1044
1045 /*
1046   Dumps the final compressed table form of the trie to Perl_debug_log.
1047   Used for debugging make_trie().
1048 */
1049
1050 STATIC void
1051 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1052             AV *revcharmap, U32 depth)
1053 {
1054     U32 state;
1055     SV *sv=sv_newmortal();
1056     int colwidth= widecharmap ? 6 : 4;
1057     U16 word;
1058     GET_RE_DEBUG_FLAGS_DECL;
1059
1060     PERL_ARGS_ASSERT_DUMP_TRIE;
1061
1062     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1063         (int)depth * 2 + 2,"",
1064         "Match","Base","Ofs" );
1065
1066     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1067         SV ** const tmp = av_fetch( revcharmap, state, 0);
1068         if ( tmp ) {
1069             PerlIO_printf( Perl_debug_log, "%*s",
1070                 colwidth,
1071                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1072                             PL_colors[0], PL_colors[1],
1073                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1074                             PERL_PV_ESCAPE_FIRSTCHAR
1075                 )
1076             );
1077         }
1078     }
1079     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1080         (int)depth * 2 + 2,"");
1081
1082     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1083         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1084     PerlIO_printf( Perl_debug_log, "\n");
1085
1086     for( state = 1 ; state < trie->statecount ; state++ ) {
1087         const U32 base = trie->states[ state ].trans.base;
1088
1089         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1090
1091         if ( trie->states[ state ].wordnum ) {
1092             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1093         } else {
1094             PerlIO_printf( Perl_debug_log, "%6s", "" );
1095         }
1096
1097         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1098
1099         if ( base ) {
1100             U32 ofs = 0;
1101
1102             while( ( base + ofs  < trie->uniquecharcount ) ||
1103                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1104                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1105                     ofs++;
1106
1107             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1108
1109             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1110                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1111                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1112                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1113                 {
1114                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1115                     colwidth,
1116                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1117                 } else {
1118                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1119                 }
1120             }
1121
1122             PerlIO_printf( Perl_debug_log, "]");
1123
1124         }
1125         PerlIO_printf( Perl_debug_log, "\n" );
1126     }
1127     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1128     for (word=1; word <= trie->wordcount; word++) {
1129         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1130             (int)word, (int)(trie->wordinfo[word].prev),
1131             (int)(trie->wordinfo[word].len));
1132     }
1133     PerlIO_printf(Perl_debug_log, "\n" );
1134 }
1135 /*
1136   Dumps a fully constructed but uncompressed trie in list form.
1137   List tries normally only are used for construction when the number of
1138   possible chars (trie->uniquecharcount) is very high.
1139   Used for debugging make_trie().
1140 */
1141 STATIC void
1142 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1143                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1144                          U32 depth)
1145 {
1146     U32 state;
1147     SV *sv=sv_newmortal();
1148     int colwidth= widecharmap ? 6 : 4;
1149     GET_RE_DEBUG_FLAGS_DECL;
1150
1151     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1152
1153     /* print out the table precompression.  */
1154     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1155         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1156         "------:-----+-----------------\n" );
1157
1158     for( state=1 ; state < next_alloc ; state ++ ) {
1159         U16 charid;
1160
1161         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1162             (int)depth * 2 + 2,"", (UV)state  );
1163         if ( ! trie->states[ state ].wordnum ) {
1164             PerlIO_printf( Perl_debug_log, "%5s| ","");
1165         } else {
1166             PerlIO_printf( Perl_debug_log, "W%4x| ",
1167                 trie->states[ state ].wordnum
1168             );
1169         }
1170         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1171             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1172             if ( tmp ) {
1173                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1174                     colwidth,
1175                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1176                             PL_colors[0], PL_colors[1],
1177                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1178                             PERL_PV_ESCAPE_FIRSTCHAR
1179                     ) ,
1180                     TRIE_LIST_ITEM(state,charid).forid,
1181                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1182                 );
1183                 if (!(charid % 10))
1184                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1185                         (int)((depth * 2) + 14), "");
1186             }
1187         }
1188         PerlIO_printf( Perl_debug_log, "\n");
1189     }
1190 }
1191
1192 /*
1193   Dumps a fully constructed but uncompressed trie in table form.
1194   This is the normal DFA style state transition table, with a few
1195   twists to facilitate compression later.
1196   Used for debugging make_trie().
1197 */
1198 STATIC void
1199 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1200                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1201                           U32 depth)
1202 {
1203     U32 state;
1204     U16 charid;
1205     SV *sv=sv_newmortal();
1206     int colwidth= widecharmap ? 6 : 4;
1207     GET_RE_DEBUG_FLAGS_DECL;
1208
1209     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1210
1211     /*
1212        print out the table precompression so that we can do a visual check
1213        that they are identical.
1214      */
1215
1216     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1217
1218     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1219         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1220         if ( tmp ) {
1221             PerlIO_printf( Perl_debug_log, "%*s",
1222                 colwidth,
1223                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224                             PL_colors[0], PL_colors[1],
1225                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226                             PERL_PV_ESCAPE_FIRSTCHAR
1227                 )
1228             );
1229         }
1230     }
1231
1232     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1233
1234     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1235         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1236     }
1237
1238     PerlIO_printf( Perl_debug_log, "\n" );
1239
1240     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1241
1242         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1243             (int)depth * 2 + 2,"",
1244             (UV)TRIE_NODENUM( state ) );
1245
1246         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1247             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1248             if (v)
1249                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1250             else
1251                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1252         }
1253         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1254             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1255         } else {
1256             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1257             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1258         }
1259     }
1260 }
1261
1262 #endif
1263
1264
1265 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1266   startbranch: the first branch in the whole branch sequence
1267   first      : start branch of sequence of branch-exact nodes.
1268                May be the same as startbranch
1269   last       : Thing following the last branch.
1270                May be the same as tail.
1271   tail       : item following the branch sequence
1272   count      : words in the sequence
1273   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1274   depth      : indent depth
1275
1276 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1277
1278 A trie is an N'ary tree where the branches are determined by digital
1279 decomposition of the key. IE, at the root node you look up the 1st character and
1280 follow that branch repeat until you find the end of the branches. Nodes can be
1281 marked as "accepting" meaning they represent a complete word. Eg:
1282
1283   /he|she|his|hers/
1284
1285 would convert into the following structure. Numbers represent states, letters
1286 following numbers represent valid transitions on the letter from that state, if
1287 the number is in square brackets it represents an accepting state, otherwise it
1288 will be in parenthesis.
1289
1290       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1291       |    |
1292       |   (2)
1293       |    |
1294      (1)   +-i->(6)-+-s->[7]
1295       |
1296       +-s->(3)-+-h->(4)-+-e->[5]
1297
1298       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1299
1300 This shows that when matching against the string 'hers' we will begin at state 1
1301 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1302 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1303 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1304 single traverse. We store a mapping from accepting to state to which word was
1305 matched, and then when we have multiple possibilities we try to complete the
1306 rest of the regex in the order in which they occured in the alternation.
1307
1308 The only prior NFA like behaviour that would be changed by the TRIE support is
1309 the silent ignoring of duplicate alternations which are of the form:
1310
1311  / (DUPE|DUPE) X? (?{ ... }) Y /x
1312
1313 Thus EVAL blocks following a trie may be called a different number of times with
1314 and without the optimisation. With the optimisations dupes will be silently
1315 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1316 the following demonstrates:
1317
1318  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1319
1320 which prints out 'word' three times, but
1321
1322  'words'=~/(word|word|word)(?{ print $1 })S/
1323
1324 which doesnt print it out at all. This is due to other optimisations kicking in.
1325
1326 Example of what happens on a structural level:
1327
1328 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1329
1330    1: CURLYM[1] {1,32767}(18)
1331    5:   BRANCH(8)
1332    6:     EXACT <ac>(16)
1333    8:   BRANCH(11)
1334    9:     EXACT <ad>(16)
1335   11:   BRANCH(14)
1336   12:     EXACT <ab>(16)
1337   16:   SUCCEED(0)
1338   17:   NOTHING(18)
1339   18: END(0)
1340
1341 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1342 and should turn into:
1343
1344    1: CURLYM[1] {1,32767}(18)
1345    5:   TRIE(16)
1346         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1347           <ac>
1348           <ad>
1349           <ab>
1350   16:   SUCCEED(0)
1351   17:   NOTHING(18)
1352   18: END(0)
1353
1354 Cases where tail != last would be like /(?foo|bar)baz/:
1355
1356    1: BRANCH(4)
1357    2:   EXACT <foo>(8)
1358    4: BRANCH(7)
1359    5:   EXACT <bar>(8)
1360    7: TAIL(8)
1361    8: EXACT <baz>(10)
1362   10: END(0)
1363
1364 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1365 and would end up looking like:
1366
1367     1: TRIE(8)
1368       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1369         <foo>
1370         <bar>
1371    7: TAIL(8)
1372    8: EXACT <baz>(10)
1373   10: END(0)
1374
1375     d = uvuni_to_utf8_flags(d, uv, 0);
1376
1377 is the recommended Unicode-aware way of saying
1378
1379     *(d++) = uv;
1380 */
1381
1382 #define TRIE_STORE_REVCHAR(val)                                            \
1383     STMT_START {                                                           \
1384         if (UTF) {                                                         \
1385             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1386             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1387             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1388             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1389             SvPOK_on(zlopp);                                               \
1390             SvUTF8_on(zlopp);                                              \
1391             av_push(revcharmap, zlopp);                                    \
1392         } else {                                                           \
1393             char ooooff = (char)val;                                           \
1394             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1395         }                                                                  \
1396         } STMT_END
1397
1398 #define TRIE_READ_CHAR STMT_START {                                                     \
1399     wordlen++;                                                                          \
1400     if ( UTF ) {                                                                        \
1401         /* if it is UTF then it is either already folded, or does not need folding */   \
1402         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1403     }                                                                                   \
1404     else if (folder == PL_fold_latin1) {                                                \
1405         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1406         if ( foldlen > 0 ) {                                                            \
1407            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1408            foldlen -= len;                                                              \
1409            scan += len;                                                                 \
1410            len = 0;                                                                     \
1411         } else {                                                                        \
1412             len = 1;                                                                    \
1413             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1414             skiplen = UNISKIP(uvc);                                                     \
1415             foldlen -= skiplen;                                                         \
1416             scan = foldbuf + skiplen;                                                   \
1417         }                                                                               \
1418     } else {                                                                            \
1419         /* raw data, will be folded later if needed */                                  \
1420         uvc = (U32)*uc;                                                                 \
1421         len = 1;                                                                        \
1422     }                                                                                   \
1423 } STMT_END
1424
1425
1426
1427 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1428     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1429         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1430         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1431     }                                                           \
1432     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1433     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1434     TRIE_LIST_CUR( state )++;                                   \
1435 } STMT_END
1436
1437 #define TRIE_LIST_NEW(state) STMT_START {                       \
1438     Newxz( trie->states[ state ].trans.list,               \
1439         4, reg_trie_trans_le );                                 \
1440      TRIE_LIST_CUR( state ) = 1;                                \
1441      TRIE_LIST_LEN( state ) = 4;                                \
1442 } STMT_END
1443
1444 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1445     U16 dupe= trie->states[ state ].wordnum;                    \
1446     regnode * const noper_next = regnext( noper );              \
1447                                                                 \
1448     DEBUG_r({                                                   \
1449         /* store the word for dumping */                        \
1450         SV* tmp;                                                \
1451         if (OP(noper) != NOTHING)                               \
1452             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1453         else                                                    \
1454             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1455         av_push( trie_words, tmp );                             \
1456     });                                                         \
1457                                                                 \
1458     curword++;                                                  \
1459     trie->wordinfo[curword].prev   = 0;                         \
1460     trie->wordinfo[curword].len    = wordlen;                   \
1461     trie->wordinfo[curword].accept = state;                     \
1462                                                                 \
1463     if ( noper_next < tail ) {                                  \
1464         if (!trie->jump)                                        \
1465             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1466         trie->jump[curword] = (U16)(noper_next - convert);      \
1467         if (!jumper)                                            \
1468             jumper = noper_next;                                \
1469         if (!nextbranch)                                        \
1470             nextbranch= regnext(cur);                           \
1471     }                                                           \
1472                                                                 \
1473     if ( dupe ) {                                               \
1474         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1475         /* chain, so that when the bits of chain are later    */\
1476         /* linked together, the dups appear in the chain      */\
1477         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1478         trie->wordinfo[dupe].prev = curword;                    \
1479     } else {                                                    \
1480         /* we haven't inserted this word yet.                */ \
1481         trie->states[ state ].wordnum = curword;                \
1482     }                                                           \
1483 } STMT_END
1484
1485
1486 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1487      ( ( base + charid >=  ucharcount                                   \
1488          && base + charid < ubound                                      \
1489          && state == trie->trans[ base - ucharcount + charid ].check    \
1490          && trie->trans[ base - ucharcount + charid ].next )            \
1491            ? trie->trans[ base - ucharcount + charid ].next             \
1492            : ( state==1 ? special : 0 )                                 \
1493       )
1494
1495 #define MADE_TRIE       1
1496 #define MADE_JUMP_TRIE  2
1497 #define MADE_EXACT_TRIE 4
1498
1499 STATIC I32
1500 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1501 {
1502     dVAR;
1503     /* first pass, loop through and scan words */
1504     reg_trie_data *trie;
1505     HV *widecharmap = NULL;
1506     AV *revcharmap = newAV();
1507     regnode *cur;
1508     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1509     STRLEN len = 0;
1510     UV uvc = 0;
1511     U16 curword = 0;
1512     U32 next_alloc = 0;
1513     regnode *jumper = NULL;
1514     regnode *nextbranch = NULL;
1515     regnode *convert = NULL;
1516     U32 *prev_states; /* temp array mapping each state to previous one */
1517     /* we just use folder as a flag in utf8 */
1518     const U8 * folder = NULL;
1519
1520 #ifdef DEBUGGING
1521     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1522     AV *trie_words = NULL;
1523     /* along with revcharmap, this only used during construction but both are
1524      * useful during debugging so we store them in the struct when debugging.
1525      */
1526 #else
1527     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1528     STRLEN trie_charcount=0;
1529 #endif
1530     SV *re_trie_maxbuff;
1531     GET_RE_DEBUG_FLAGS_DECL;
1532
1533     PERL_ARGS_ASSERT_MAKE_TRIE;
1534 #ifndef DEBUGGING
1535     PERL_UNUSED_ARG(depth);
1536 #endif
1537
1538     switch (flags) {
1539         case EXACT: break;
1540         case EXACTFA:
1541         case EXACTFU_SS:
1542         case EXACTFU_TRICKYFOLD:
1543         case EXACTFU: folder = PL_fold_latin1; break;
1544         case EXACTF:  folder = PL_fold; break;
1545         case EXACTFL: folder = PL_fold_locale; break;
1546         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1547     }
1548
1549     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1550     trie->refcount = 1;
1551     trie->startstate = 1;
1552     trie->wordcount = word_count;
1553     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1554     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1555     if (flags == EXACT)
1556         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1557     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1558                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1559
1560     DEBUG_r({
1561         trie_words = newAV();
1562     });
1563
1564     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1565     if (!SvIOK(re_trie_maxbuff)) {
1566         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1567     }
1568     DEBUG_TRIE_COMPILE_r({
1569                 PerlIO_printf( Perl_debug_log,
1570                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1571                   (int)depth * 2 + 2, "",
1572                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1573                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1574                   (int)depth);
1575     });
1576
1577    /* Find the node we are going to overwrite */
1578     if ( first == startbranch && OP( last ) != BRANCH ) {
1579         /* whole branch chain */
1580         convert = first;
1581     } else {
1582         /* branch sub-chain */
1583         convert = NEXTOPER( first );
1584     }
1585
1586     /*  -- First loop and Setup --
1587
1588        We first traverse the branches and scan each word to determine if it
1589        contains widechars, and how many unique chars there are, this is
1590        important as we have to build a table with at least as many columns as we
1591        have unique chars.
1592
1593        We use an array of integers to represent the character codes 0..255
1594        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1595        native representation of the character value as the key and IV's for the
1596        coded index.
1597
1598        *TODO* If we keep track of how many times each character is used we can
1599        remap the columns so that the table compression later on is more
1600        efficient in terms of memory by ensuring the most common value is in the
1601        middle and the least common are on the outside.  IMO this would be better
1602        than a most to least common mapping as theres a decent chance the most
1603        common letter will share a node with the least common, meaning the node
1604        will not be compressible. With a middle is most common approach the worst
1605        case is when we have the least common nodes twice.
1606
1607      */
1608
1609     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1610         regnode *noper = NEXTOPER( cur );
1611         const U8 *uc = (U8*)STRING( noper );
1612         const U8 *e  = uc + STR_LEN( noper );
1613         STRLEN foldlen = 0;
1614         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1615         STRLEN skiplen = 0;
1616         const U8 *scan = (U8*)NULL;
1617         U32 wordlen      = 0;         /* required init */
1618         STRLEN chars = 0;
1619         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1620
1621         if (OP(noper) == NOTHING) {
1622             regnode *noper_next= regnext(noper);
1623             if (noper_next != tail && OP(noper_next) == flags) {
1624                 noper = noper_next;
1625                 uc= (U8*)STRING(noper);
1626                 e= uc + STR_LEN(noper);
1627                 trie->minlen= STR_LEN(noper);
1628             } else {
1629                 trie->minlen= 0;
1630                 continue;
1631             }
1632         }
1633
1634         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1635             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1636                                           regardless of encoding */
1637             if (OP( noper ) == EXACTFU_SS) {
1638                 /* false positives are ok, so just set this */
1639                 TRIE_BITMAP_SET(trie,0xDF);
1640             }
1641         }
1642         for ( ; uc < e ; uc += len ) {
1643             TRIE_CHARCOUNT(trie)++;
1644             TRIE_READ_CHAR;
1645             chars++;
1646             if ( uvc < 256 ) {
1647                 if ( folder ) {
1648                     U8 folded= folder[ (U8) uvc ];
1649                     if ( !trie->charmap[ folded ] ) {
1650                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1651                         TRIE_STORE_REVCHAR( folded );
1652                     }
1653                 }
1654                 if ( !trie->charmap[ uvc ] ) {
1655                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1656                     TRIE_STORE_REVCHAR( uvc );
1657                 }
1658                 if ( set_bit ) {
1659                     /* store the codepoint in the bitmap, and its folded
1660                      * equivalent. */
1661                     TRIE_BITMAP_SET(trie, uvc);
1662
1663                     /* store the folded codepoint */
1664                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1665
1666                     if ( !UTF ) {
1667                         /* store first byte of utf8 representation of
1668                            variant codepoints */
1669                         if (! UNI_IS_INVARIANT(uvc)) {
1670                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1671                         }
1672                     }
1673                     set_bit = 0; /* We've done our bit :-) */
1674                 }
1675             } else {
1676                 SV** svpp;
1677                 if ( !widecharmap )
1678                     widecharmap = newHV();
1679
1680                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1681
1682                 if ( !svpp )
1683                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1684
1685                 if ( !SvTRUE( *svpp ) ) {
1686                     sv_setiv( *svpp, ++trie->uniquecharcount );
1687                     TRIE_STORE_REVCHAR(uvc);
1688                 }
1689             }
1690         }
1691         if( cur == first ) {
1692             trie->minlen = chars;
1693             trie->maxlen = chars;
1694         } else if (chars < trie->minlen) {
1695             trie->minlen = chars;
1696         } else if (chars > trie->maxlen) {
1697             trie->maxlen = chars;
1698         }
1699         if (OP( noper ) == EXACTFU_SS) {
1700             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1701             if (trie->minlen > 1)
1702                 trie->minlen= 1;
1703         }
1704         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1705             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1706              *                - We assume that any such sequence might match a 2 byte string */
1707             if (trie->minlen > 2 )
1708                 trie->minlen= 2;
1709         }
1710
1711     } /* end first pass */
1712     DEBUG_TRIE_COMPILE_r(
1713         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1714                 (int)depth * 2 + 2,"",
1715                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1716                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1717                 (int)trie->minlen, (int)trie->maxlen )
1718     );
1719
1720     /*
1721         We now know what we are dealing with in terms of unique chars and
1722         string sizes so we can calculate how much memory a naive
1723         representation using a flat table  will take. If it's over a reasonable
1724         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1725         conservative but potentially much slower representation using an array
1726         of lists.
1727
1728         At the end we convert both representations into the same compressed
1729         form that will be used in regexec.c for matching with. The latter
1730         is a form that cannot be used to construct with but has memory
1731         properties similar to the list form and access properties similar
1732         to the table form making it both suitable for fast searches and
1733         small enough that its feasable to store for the duration of a program.
1734
1735         See the comment in the code where the compressed table is produced
1736         inplace from the flat tabe representation for an explanation of how
1737         the compression works.
1738
1739     */
1740
1741
1742     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1743     prev_states[1] = 0;
1744
1745     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1746         /*
1747             Second Pass -- Array Of Lists Representation
1748
1749             Each state will be represented by a list of charid:state records
1750             (reg_trie_trans_le) the first such element holds the CUR and LEN
1751             points of the allocated array. (See defines above).
1752
1753             We build the initial structure using the lists, and then convert
1754             it into the compressed table form which allows faster lookups
1755             (but cant be modified once converted).
1756         */
1757
1758         STRLEN transcount = 1;
1759
1760         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761             "%*sCompiling trie using list compiler\n",
1762             (int)depth * 2 + 2, ""));
1763
1764         trie->states = (reg_trie_state *)
1765             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1766                                   sizeof(reg_trie_state) );
1767         TRIE_LIST_NEW(1);
1768         next_alloc = 2;
1769
1770         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1771
1772             regnode *noper   = NEXTOPER( cur );
1773             U8 *uc           = (U8*)STRING( noper );
1774             const U8 *e      = uc + STR_LEN( noper );
1775             U32 state        = 1;         /* required init */
1776             U16 charid       = 0;         /* sanity init */
1777             U8 *scan         = (U8*)NULL; /* sanity init */
1778             STRLEN foldlen   = 0;         /* required init */
1779             U32 wordlen      = 0;         /* required init */
1780             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1781             STRLEN skiplen   = 0;
1782
1783             if (OP(noper) == NOTHING) {
1784                 regnode *noper_next= regnext(noper);
1785                 if (noper_next != tail && OP(noper_next) == flags) {
1786                     noper = noper_next;
1787                     uc= (U8*)STRING(noper);
1788                     e= uc + STR_LEN(noper);
1789                 }
1790             }
1791
1792             if (OP(noper) != NOTHING) {
1793                 for ( ; uc < e ; uc += len ) {
1794
1795                     TRIE_READ_CHAR;
1796
1797                     if ( uvc < 256 ) {
1798                         charid = trie->charmap[ uvc ];
1799                     } else {
1800                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801                         if ( !svpp ) {
1802                             charid = 0;
1803                         } else {
1804                             charid=(U16)SvIV( *svpp );
1805                         }
1806                     }
1807                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1808                     if ( charid ) {
1809
1810                         U16 check;
1811                         U32 newstate = 0;
1812
1813                         charid--;
1814                         if ( !trie->states[ state ].trans.list ) {
1815                             TRIE_LIST_NEW( state );
1816                         }
1817                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1818                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1819                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1820                                 break;
1821                             }
1822                         }
1823                         if ( ! newstate ) {
1824                             newstate = next_alloc++;
1825                             prev_states[newstate] = state;
1826                             TRIE_LIST_PUSH( state, charid, newstate );
1827                             transcount++;
1828                         }
1829                         state = newstate;
1830                     } else {
1831                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1832                     }
1833                 }
1834             }
1835             TRIE_HANDLE_WORD(state);
1836
1837         } /* end second pass */
1838
1839         /* next alloc is the NEXT state to be allocated */
1840         trie->statecount = next_alloc;
1841         trie->states = (reg_trie_state *)
1842             PerlMemShared_realloc( trie->states,
1843                                    next_alloc
1844                                    * sizeof(reg_trie_state) );
1845
1846         /* and now dump it out before we compress it */
1847         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1848                                                          revcharmap, next_alloc,
1849                                                          depth+1)
1850         );
1851
1852         trie->trans = (reg_trie_trans *)
1853             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1854         {
1855             U32 state;
1856             U32 tp = 0;
1857             U32 zp = 0;
1858
1859
1860             for( state=1 ; state < next_alloc ; state ++ ) {
1861                 U32 base=0;
1862
1863                 /*
1864                 DEBUG_TRIE_COMPILE_MORE_r(
1865                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1866                 );
1867                 */
1868
1869                 if (trie->states[state].trans.list) {
1870                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1871                     U16 maxid=minid;
1872                     U16 idx;
1873
1874                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1875                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1876                         if ( forid < minid ) {
1877                             minid=forid;
1878                         } else if ( forid > maxid ) {
1879                             maxid=forid;
1880                         }
1881                     }
1882                     if ( transcount < tp + maxid - minid + 1) {
1883                         transcount *= 2;
1884                         trie->trans = (reg_trie_trans *)
1885                             PerlMemShared_realloc( trie->trans,
1886                                                      transcount
1887                                                      * sizeof(reg_trie_trans) );
1888                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1889                     }
1890                     base = trie->uniquecharcount + tp - minid;
1891                     if ( maxid == minid ) {
1892                         U32 set = 0;
1893                         for ( ; zp < tp ; zp++ ) {
1894                             if ( ! trie->trans[ zp ].next ) {
1895                                 base = trie->uniquecharcount + zp - minid;
1896                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1897                                 trie->trans[ zp ].check = state;
1898                                 set = 1;
1899                                 break;
1900                             }
1901                         }
1902                         if ( !set ) {
1903                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1904                             trie->trans[ tp ].check = state;
1905                             tp++;
1906                             zp = tp;
1907                         }
1908                     } else {
1909                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1910                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1911                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1912                             trie->trans[ tid ].check = state;
1913                         }
1914                         tp += ( maxid - minid + 1 );
1915                     }
1916                     Safefree(trie->states[ state ].trans.list);
1917                 }
1918                 /*
1919                 DEBUG_TRIE_COMPILE_MORE_r(
1920                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1921                 );
1922                 */
1923                 trie->states[ state ].trans.base=base;
1924             }
1925             trie->lasttrans = tp + 1;
1926         }
1927     } else {
1928         /*
1929            Second Pass -- Flat Table Representation.
1930
1931            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1932            We know that we will need Charcount+1 trans at most to store the data
1933            (one row per char at worst case) So we preallocate both structures
1934            assuming worst case.
1935
1936            We then construct the trie using only the .next slots of the entry
1937            structs.
1938
1939            We use the .check field of the first entry of the node temporarily to
1940            make compression both faster and easier by keeping track of how many non
1941            zero fields are in the node.
1942
1943            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1944            transition.
1945
1946            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1947            number representing the first entry of the node, and state as a
1948            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1949            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1950            are 2 entrys per node. eg:
1951
1952              A B       A B
1953           1. 2 4    1. 3 7
1954           2. 0 3    3. 0 5
1955           3. 0 0    5. 0 0
1956           4. 0 0    7. 0 0
1957
1958            The table is internally in the right hand, idx form. However as we also
1959            have to deal with the states array which is indexed by nodenum we have to
1960            use TRIE_NODENUM() to convert.
1961
1962         */
1963         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1964             "%*sCompiling trie using table compiler\n",
1965             (int)depth * 2 + 2, ""));
1966
1967         trie->trans = (reg_trie_trans *)
1968             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1969                                   * trie->uniquecharcount + 1,
1970                                   sizeof(reg_trie_trans) );
1971         trie->states = (reg_trie_state *)
1972             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1973                                   sizeof(reg_trie_state) );
1974         next_alloc = trie->uniquecharcount + 1;
1975
1976
1977         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1978
1979             regnode *noper   = NEXTOPER( cur );
1980             const U8 *uc     = (U8*)STRING( noper );
1981             const U8 *e      = uc + STR_LEN( noper );
1982
1983             U32 state        = 1;         /* required init */
1984
1985             U16 charid       = 0;         /* sanity init */
1986             U32 accept_state = 0;         /* sanity init */
1987             U8 *scan         = (U8*)NULL; /* sanity init */
1988
1989             STRLEN foldlen   = 0;         /* required init */
1990             U32 wordlen      = 0;         /* required init */
1991             STRLEN skiplen   = 0;
1992             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1993
1994             if (OP(noper) == NOTHING) {
1995                 regnode *noper_next= regnext(noper);
1996                 if (noper_next != tail && OP(noper_next) == flags) {
1997                     noper = noper_next;
1998                     uc= (U8*)STRING(noper);
1999                     e= uc + STR_LEN(noper);
2000                 }
2001             }
2002
2003             if ( OP(noper) != NOTHING ) {
2004                 for ( ; uc < e ; uc += len ) {
2005
2006                     TRIE_READ_CHAR;
2007
2008                     if ( uvc < 256 ) {
2009                         charid = trie->charmap[ uvc ];
2010                     } else {
2011                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2012                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2013                     }
2014                     if ( charid ) {
2015                         charid--;
2016                         if ( !trie->trans[ state + charid ].next ) {
2017                             trie->trans[ state + charid ].next = next_alloc;
2018                             trie->trans[ state ].check++;
2019                             prev_states[TRIE_NODENUM(next_alloc)]
2020                                     = TRIE_NODENUM(state);
2021                             next_alloc += trie->uniquecharcount;
2022                         }
2023                         state = trie->trans[ state + charid ].next;
2024                     } else {
2025                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2026                     }
2027                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2028                 }
2029             }
2030             accept_state = TRIE_NODENUM( state );
2031             TRIE_HANDLE_WORD(accept_state);
2032
2033         } /* end second pass */
2034
2035         /* and now dump it out before we compress it */
2036         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2037                                                           revcharmap,
2038                                                           next_alloc, depth+1));
2039
2040         {
2041         /*
2042            * Inplace compress the table.*
2043
2044            For sparse data sets the table constructed by the trie algorithm will
2045            be mostly 0/FAIL transitions or to put it another way mostly empty.
2046            (Note that leaf nodes will not contain any transitions.)
2047
2048            This algorithm compresses the tables by eliminating most such
2049            transitions, at the cost of a modest bit of extra work during lookup:
2050
2051            - Each states[] entry contains a .base field which indicates the
2052            index in the state[] array wheres its transition data is stored.
2053
2054            - If .base is 0 there are no valid transitions from that node.
2055
2056            - If .base is nonzero then charid is added to it to find an entry in
2057            the trans array.
2058
2059            -If trans[states[state].base+charid].check!=state then the
2060            transition is taken to be a 0/Fail transition. Thus if there are fail
2061            transitions at the front of the node then the .base offset will point
2062            somewhere inside the previous nodes data (or maybe even into a node
2063            even earlier), but the .check field determines if the transition is
2064            valid.
2065
2066            XXX - wrong maybe?
2067            The following process inplace converts the table to the compressed
2068            table: We first do not compress the root node 1,and mark all its
2069            .check pointers as 1 and set its .base pointer as 1 as well. This
2070            allows us to do a DFA construction from the compressed table later,
2071            and ensures that any .base pointers we calculate later are greater
2072            than 0.
2073
2074            - We set 'pos' to indicate the first entry of the second node.
2075
2076            - We then iterate over the columns of the node, finding the first and
2077            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2078            and set the .check pointers accordingly, and advance pos
2079            appropriately and repreat for the next node. Note that when we copy
2080            the next pointers we have to convert them from the original
2081            NODEIDX form to NODENUM form as the former is not valid post
2082            compression.
2083
2084            - If a node has no transitions used we mark its base as 0 and do not
2085            advance the pos pointer.
2086
2087            - If a node only has one transition we use a second pointer into the
2088            structure to fill in allocated fail transitions from other states.
2089            This pointer is independent of the main pointer and scans forward
2090            looking for null transitions that are allocated to a state. When it
2091            finds one it writes the single transition into the "hole".  If the
2092            pointer doesnt find one the single transition is appended as normal.
2093
2094            - Once compressed we can Renew/realloc the structures to release the
2095            excess space.
2096
2097            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2098            specifically Fig 3.47 and the associated pseudocode.
2099
2100            demq
2101         */
2102         const U32 laststate = TRIE_NODENUM( next_alloc );
2103         U32 state, charid;
2104         U32 pos = 0, zp=0;
2105         trie->statecount = laststate;
2106
2107         for ( state = 1 ; state < laststate ; state++ ) {
2108             U8 flag = 0;
2109             const U32 stateidx = TRIE_NODEIDX( state );
2110             const U32 o_used = trie->trans[ stateidx ].check;
2111             U32 used = trie->trans[ stateidx ].check;
2112             trie->trans[ stateidx ].check = 0;
2113
2114             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2115                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2116                     if ( trie->trans[ stateidx + charid ].next ) {
2117                         if (o_used == 1) {
2118                             for ( ; zp < pos ; zp++ ) {
2119                                 if ( ! trie->trans[ zp ].next ) {
2120                                     break;
2121                                 }
2122                             }
2123                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2124                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2125                             trie->trans[ zp ].check = state;
2126                             if ( ++zp > pos ) pos = zp;
2127                             break;
2128                         }
2129                         used--;
2130                     }
2131                     if ( !flag ) {
2132                         flag = 1;
2133                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2134                     }
2135                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2136                     trie->trans[ pos ].check = state;
2137                     pos++;
2138                 }
2139             }
2140         }
2141         trie->lasttrans = pos + 1;
2142         trie->states = (reg_trie_state *)
2143             PerlMemShared_realloc( trie->states, laststate
2144                                    * sizeof(reg_trie_state) );
2145         DEBUG_TRIE_COMPILE_MORE_r(
2146                 PerlIO_printf( Perl_debug_log,
2147                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2148                     (int)depth * 2 + 2,"",
2149                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2150                     (IV)next_alloc,
2151                     (IV)pos,
2152                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2153             );
2154
2155         } /* end table compress */
2156     }
2157     DEBUG_TRIE_COMPILE_MORE_r(
2158             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2159                 (int)depth * 2 + 2, "",
2160                 (UV)trie->statecount,
2161                 (UV)trie->lasttrans)
2162     );
2163     /* resize the trans array to remove unused space */
2164     trie->trans = (reg_trie_trans *)
2165         PerlMemShared_realloc( trie->trans, trie->lasttrans
2166                                * sizeof(reg_trie_trans) );
2167
2168     {   /* Modify the program and insert the new TRIE node */
2169         U8 nodetype =(U8)(flags & 0xFF);
2170         char *str=NULL;
2171
2172 #ifdef DEBUGGING
2173         regnode *optimize = NULL;
2174 #ifdef RE_TRACK_PATTERN_OFFSETS
2175
2176         U32 mjd_offset = 0;
2177         U32 mjd_nodelen = 0;
2178 #endif /* RE_TRACK_PATTERN_OFFSETS */
2179 #endif /* DEBUGGING */
2180         /*
2181            This means we convert either the first branch or the first Exact,
2182            depending on whether the thing following (in 'last') is a branch
2183            or not and whther first is the startbranch (ie is it a sub part of
2184            the alternation or is it the whole thing.)
2185            Assuming its a sub part we convert the EXACT otherwise we convert
2186            the whole branch sequence, including the first.
2187          */
2188         /* Find the node we are going to overwrite */
2189         if ( first != startbranch || OP( last ) == BRANCH ) {
2190             /* branch sub-chain */
2191             NEXT_OFF( first ) = (U16)(last - first);
2192 #ifdef RE_TRACK_PATTERN_OFFSETS
2193             DEBUG_r({
2194                 mjd_offset= Node_Offset((convert));
2195                 mjd_nodelen= Node_Length((convert));
2196             });
2197 #endif
2198             /* whole branch chain */
2199         }
2200 #ifdef RE_TRACK_PATTERN_OFFSETS
2201         else {
2202             DEBUG_r({
2203                 const  regnode *nop = NEXTOPER( convert );
2204                 mjd_offset= Node_Offset((nop));
2205                 mjd_nodelen= Node_Length((nop));
2206             });
2207         }
2208         DEBUG_OPTIMISE_r(
2209             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2210                 (int)depth * 2 + 2, "",
2211                 (UV)mjd_offset, (UV)mjd_nodelen)
2212         );
2213 #endif
2214         /* But first we check to see if there is a common prefix we can
2215            split out as an EXACT and put in front of the TRIE node.  */
2216         trie->startstate= 1;
2217         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2218             U32 state;
2219             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2220                 U32 ofs = 0;
2221                 I32 idx = -1;
2222                 U32 count = 0;
2223                 const U32 base = trie->states[ state ].trans.base;
2224
2225                 if ( trie->states[state].wordnum )
2226                         count = 1;
2227
2228                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2229                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2230                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2231                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2232                     {
2233                         if ( ++count > 1 ) {
2234                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2235                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2236                             if ( state == 1 ) break;
2237                             if ( count == 2 ) {
2238                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2239                                 DEBUG_OPTIMISE_r(
2240                                     PerlIO_printf(Perl_debug_log,
2241                                         "%*sNew Start State=%"UVuf" Class: [",
2242                                         (int)depth * 2 + 2, "",
2243                                         (UV)state));
2244                                 if (idx >= 0) {
2245                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2246                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2247
2248                                     TRIE_BITMAP_SET(trie,*ch);
2249                                     if ( folder )
2250                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2251                                     DEBUG_OPTIMISE_r(
2252                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2253                                     );
2254                                 }
2255                             }
2256                             TRIE_BITMAP_SET(trie,*ch);
2257                             if ( folder )
2258                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2259                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2260                         }
2261                         idx = ofs;
2262                     }
2263                 }
2264                 if ( count == 1 ) {
2265                     SV **tmp = av_fetch( revcharmap, idx, 0);
2266                     STRLEN len;
2267                     char *ch = SvPV( *tmp, len );
2268                     DEBUG_OPTIMISE_r({
2269                         SV *sv=sv_newmortal();
2270                         PerlIO_printf( Perl_debug_log,
2271                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2272                             (int)depth * 2 + 2, "",
2273                             (UV)state, (UV)idx,
2274                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2275                                 PL_colors[0], PL_colors[1],
2276                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2277                                 PERL_PV_ESCAPE_FIRSTCHAR
2278                             )
2279                         );
2280                     });
2281                     if ( state==1 ) {
2282                         OP( convert ) = nodetype;
2283                         str=STRING(convert);
2284                         STR_LEN(convert)=0;
2285                     }
2286                     STR_LEN(convert) += len;
2287                     while (len--)
2288                         *str++ = *ch++;
2289                 } else {
2290 #ifdef DEBUGGING
2291                     if (state>1)
2292                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2293 #endif
2294                     break;
2295                 }
2296             }
2297             trie->prefixlen = (state-1);
2298             if (str) {
2299                 regnode *n = convert+NODE_SZ_STR(convert);
2300                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2301                 trie->startstate = state;
2302                 trie->minlen -= (state - 1);
2303                 trie->maxlen -= (state - 1);
2304 #ifdef DEBUGGING
2305                /* At least the UNICOS C compiler choked on this
2306                 * being argument to DEBUG_r(), so let's just have
2307                 * it right here. */
2308                if (
2309 #ifdef PERL_EXT_RE_BUILD
2310                    1
2311 #else
2312                    DEBUG_r_TEST
2313 #endif
2314                    ) {
2315                    regnode *fix = convert;
2316                    U32 word = trie->wordcount;
2317                    mjd_nodelen++;
2318                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2319                    while( ++fix < n ) {
2320                        Set_Node_Offset_Length(fix, 0, 0);
2321                    }
2322                    while (word--) {
2323                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2324                        if (tmp) {
2325                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2326                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2327                            else
2328                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2329                        }
2330                    }
2331                }
2332 #endif
2333                 if (trie->maxlen) {
2334                     convert = n;
2335                 } else {
2336                     NEXT_OFF(convert) = (U16)(tail - convert);
2337                     DEBUG_r(optimize= n);
2338                 }
2339             }
2340         }
2341         if (!jumper)
2342             jumper = last;
2343         if ( trie->maxlen ) {
2344             NEXT_OFF( convert ) = (U16)(tail - convert);
2345             ARG_SET( convert, data_slot );
2346             /* Store the offset to the first unabsorbed branch in
2347                jump[0], which is otherwise unused by the jump logic.
2348                We use this when dumping a trie and during optimisation. */
2349             if (trie->jump)
2350                 trie->jump[0] = (U16)(nextbranch - convert);
2351
2352             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2353              *   and there is a bitmap
2354              *   and the first "jump target" node we found leaves enough room
2355              * then convert the TRIE node into a TRIEC node, with the bitmap
2356              * embedded inline in the opcode - this is hypothetically faster.
2357              */
2358             if ( !trie->states[trie->startstate].wordnum
2359                  && trie->bitmap
2360                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2361             {
2362                 OP( convert ) = TRIEC;
2363                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2364                 PerlMemShared_free(trie->bitmap);
2365                 trie->bitmap= NULL;
2366             } else
2367                 OP( convert ) = TRIE;
2368
2369             /* store the type in the flags */
2370             convert->flags = nodetype;
2371             DEBUG_r({
2372             optimize = convert
2373                       + NODE_STEP_REGNODE
2374                       + regarglen[ OP( convert ) ];
2375             });
2376             /* XXX We really should free up the resource in trie now,
2377                    as we won't use them - (which resources?) dmq */
2378         }
2379         /* needed for dumping*/
2380         DEBUG_r(if (optimize) {
2381             regnode *opt = convert;
2382
2383             while ( ++opt < optimize) {
2384                 Set_Node_Offset_Length(opt,0,0);
2385             }
2386             /*
2387                 Try to clean up some of the debris left after the
2388                 optimisation.
2389              */
2390             while( optimize < jumper ) {
2391                 mjd_nodelen += Node_Length((optimize));
2392                 OP( optimize ) = OPTIMIZED;
2393                 Set_Node_Offset_Length(optimize,0,0);
2394                 optimize++;
2395             }
2396             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2397         });
2398     } /* end node insert */
2399
2400     /*  Finish populating the prev field of the wordinfo array.  Walk back
2401      *  from each accept state until we find another accept state, and if
2402      *  so, point the first word's .prev field at the second word. If the
2403      *  second already has a .prev field set, stop now. This will be the
2404      *  case either if we've already processed that word's accept state,
2405      *  or that state had multiple words, and the overspill words were
2406      *  already linked up earlier.
2407      */
2408     {
2409         U16 word;
2410         U32 state;
2411         U16 prev;
2412
2413         for (word=1; word <= trie->wordcount; word++) {
2414             prev = 0;
2415             if (trie->wordinfo[word].prev)
2416                 continue;
2417             state = trie->wordinfo[word].accept;
2418             while (state) {
2419                 state = prev_states[state];
2420                 if (!state)
2421                     break;
2422                 prev = trie->states[state].wordnum;
2423                 if (prev)
2424                     break;
2425             }
2426             trie->wordinfo[word].prev = prev;
2427         }
2428         Safefree(prev_states);
2429     }
2430
2431
2432     /* and now dump out the compressed format */
2433     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2434
2435     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2436 #ifdef DEBUGGING
2437     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2438     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2439 #else
2440     SvREFCNT_dec(revcharmap);
2441 #endif
2442     return trie->jump
2443            ? MADE_JUMP_TRIE
2444            : trie->startstate>1
2445              ? MADE_EXACT_TRIE
2446              : MADE_TRIE;
2447 }
2448
2449 STATIC void
2450 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2451 {
2452 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2453
2454    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2455    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2456    ISBN 0-201-10088-6
2457
2458    We find the fail state for each state in the trie, this state is the longest proper
2459    suffix of the current state's 'word' that is also a proper prefix of another word in our
2460    trie. State 1 represents the word '' and is thus the default fail state. This allows
2461    the DFA not to have to restart after its tried and failed a word at a given point, it
2462    simply continues as though it had been matching the other word in the first place.
2463    Consider
2464       'abcdgu'=~/abcdefg|cdgu/
2465    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2466    fail, which would bring us to the state representing 'd' in the second word where we would
2467    try 'g' and succeed, proceeding to match 'cdgu'.
2468  */
2469  /* add a fail transition */
2470     const U32 trie_offset = ARG(source);
2471     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2472     U32 *q;
2473     const U32 ucharcount = trie->uniquecharcount;
2474     const U32 numstates = trie->statecount;
2475     const U32 ubound = trie->lasttrans + ucharcount;
2476     U32 q_read = 0;
2477     U32 q_write = 0;
2478     U32 charid;
2479     U32 base = trie->states[ 1 ].trans.base;
2480     U32 *fail;
2481     reg_ac_data *aho;
2482     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2483     GET_RE_DEBUG_FLAGS_DECL;
2484
2485     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2486 #ifndef DEBUGGING
2487     PERL_UNUSED_ARG(depth);
2488 #endif
2489
2490
2491     ARG_SET( stclass, data_slot );
2492     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2493     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2494     aho->trie=trie_offset;
2495     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2496     Copy( trie->states, aho->states, numstates, reg_trie_state );
2497     Newxz( q, numstates, U32);
2498     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2499     aho->refcount = 1;
2500     fail = aho->fail;
2501     /* initialize fail[0..1] to be 1 so that we always have
2502        a valid final fail state */
2503     fail[ 0 ] = fail[ 1 ] = 1;
2504
2505     for ( charid = 0; charid < ucharcount ; charid++ ) {
2506         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2507         if ( newstate ) {
2508             q[ q_write ] = newstate;
2509             /* set to point at the root */
2510             fail[ q[ q_write++ ] ]=1;
2511         }
2512     }
2513     while ( q_read < q_write) {
2514         const U32 cur = q[ q_read++ % numstates ];
2515         base = trie->states[ cur ].trans.base;
2516
2517         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2518             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2519             if (ch_state) {
2520                 U32 fail_state = cur;
2521                 U32 fail_base;
2522                 do {
2523                     fail_state = fail[ fail_state ];
2524                     fail_base = aho->states[ fail_state ].trans.base;
2525                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2526
2527                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2528                 fail[ ch_state ] = fail_state;
2529                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2530                 {
2531                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2532                 }
2533                 q[ q_write++ % numstates] = ch_state;
2534             }
2535         }
2536     }
2537     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2538        when we fail in state 1, this allows us to use the
2539        charclass scan to find a valid start char. This is based on the principle
2540        that theres a good chance the string being searched contains lots of stuff
2541        that cant be a start char.
2542      */
2543     fail[ 0 ] = fail[ 1 ] = 0;
2544     DEBUG_TRIE_COMPILE_r({
2545         PerlIO_printf(Perl_debug_log,
2546                       "%*sStclass Failtable (%"UVuf" states): 0",
2547                       (int)(depth * 2), "", (UV)numstates
2548         );
2549         for( q_read=1; q_read<numstates; q_read++ ) {
2550             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2551         }
2552         PerlIO_printf(Perl_debug_log, "\n");
2553     });
2554     Safefree(q);
2555     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2556 }
2557
2558
2559 /*
2560  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2561  * These need to be revisited when a newer toolchain becomes available.
2562  */
2563 #if defined(__sparc64__) && defined(__GNUC__)
2564 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2565 #       undef  SPARC64_GCC_WORKAROUND
2566 #       define SPARC64_GCC_WORKAROUND 1
2567 #   endif
2568 #endif
2569
2570 #define DEBUG_PEEP(str,scan,depth) \
2571     DEBUG_OPTIMISE_r({if (scan){ \
2572        SV * const mysv=sv_newmortal(); \
2573        regnode *Next = regnext(scan); \
2574        regprop(RExC_rx, mysv, scan); \
2575        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2576        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2577        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2578    }});
2579
2580
2581 /* The below joins as many adjacent EXACTish nodes as possible into a single
2582  * one, and looks for problematic sequences of characters whose folds vs.
2583  * non-folds have sufficiently different lengths, that the optimizer would be
2584  * fooled into rejecting legitimate matches of them, and the trie construction
2585  * code needs to handle specially.  The joining is only done if:
2586  * 1) there is room in the current conglomerated node to entirely contain the
2587  *    next one.
2588  * 2) they are the exact same node type
2589  *
2590  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2591  * these get optimized out
2592  *
2593  * If there are problematic code sequences, *min_subtract is set to the delta
2594  * that the minimum size of the node can be less than its actual size.  And,
2595  * the node type of the result is changed to reflect that it contains these
2596  * sequences.
2597  *
2598  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2599  * and contains LATIN SMALL LETTER SHARP S
2600  *
2601  * This is as good a place as any to discuss the design of handling these
2602  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2603  * are three code points currently in Unicode whose folded lengths differ so
2604  * much from the un-folded lengths that it causes problems for the optimizer
2605  * and trie construction.  Why only these are problematic, and not others where
2606  * lengths also differ is something I (khw) do not understand.  New versions of
2607  * Unicode might add more such code points.  Hopefully the logic in
2608  * fold_grind.t that figures out what to test (in part by verifying that each
2609  * size-combination gets tested) will catch any that do come along, so they can
2610  * be added to the special handling below.  The chances of new ones are
2611  * actually rather small, as most, if not all, of the world's scripts that have
2612  * casefolding have already been encoded by Unicode.  Also, a number of
2613  * Unicode's decisions were made to allow compatibility with pre-existing
2614  * standards, and almost all of those have already been dealt with.  These
2615  * would otherwise be the most likely candidates for generating further tricky
2616  * sequences.  In other words, Unicode by itself is unlikely to add new ones
2617  * unless it is for compatibility with pre-existing standards, and there aren't
2618  * many of those left.
2619  *
2620  * The previous designs for dealing with these involved assigning a special
2621  * node for them.  This approach doesn't work, as evidenced by this example:
2622  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2623  * Both these fold to "sss", but if the pattern is parsed to create a node
2624  * that would match just the \xDF, it won't be able to handle the case where a
2625  * successful match would have to cross the node's boundary.  The new approach
2626  * that hopefully generally solves the problem generates an EXACTFU_SS node
2627  * that is "sss".
2628  *
2629  * There are a number of components to the approach (a lot of work for just
2630  * three code points!):
2631  * 1)   This routine examines each EXACTFish node that could contain the
2632  *      problematic sequences.  It returns in *min_subtract how much to
2633  *      subtract from the the actual length of the string to get a real minimum
2634  *      for one that could match it.  This number is usually 0 except for the
2635  *      problematic sequences.  This delta is used by the caller to adjust the
2636  *      min length of the match, and the delta between min and max, so that the
2637  *      optimizer doesn't reject these possibilities based on size constraints.
2638  * 2)   These sequences require special handling by the trie code, so this code
2639  *      changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
2640  *      EXACTFU_SS.
2641  * 3)   This is sufficient for the two Greek sequences (described below), but
2642  *      the one involving the Sharp s (\xDF) needs more.  The node type
2643  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2644  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2645  *      case where there is a possible fold length change.  That means that a
2646  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2647  *      itself with length changes, and so can be processed faster.  regexec.c
2648  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2649  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2650  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2651  *      fold of the MICRO SIGN requires UTF-8.  Also what EXACTF and EXACTFL
2652  *      nodes fold to isn't known until runtime.  The fold possibilities for
2653  *      the non-UTF8 patterns are quite simple, except for the sharp s.  All
2654  *      the ones that don't involve a UTF-8 target string are members of a
2655  *      fold-pair, and arrays are set up for all of them so that the other
2656  *      member of the pair can be found quickly.  Code elsewhere in this file
2657  *      makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
2658  *      if the pattern isn't UTF-8.  This avoids the issues described in the
2659  *      next item.
2660  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2661  *      'ss' or not is not knowable at compile time.  It will match iff the
2662  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2663  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2664  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2665  *      described in item 3).  An assumption that the optimizer part of
2666  *      regexec.c (probably unwittingly) makes is that a character in the
2667  *      pattern corresponds to at most a single character in the target string.
2668  *      (And I do mean character, and not byte here, unlike other parts of the
2669  *      documentation that have never been updated to account for multibyte
2670  *      Unicode.)  This assumption is wrong only in this case, as all other
2671  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2672  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2673  *      reluctant to try to change this assumption, so instead the code punts.
2674  *      This routine examines EXACTF nodes for the sharp s, and returns a
2675  *      boolean indicating whether or not the node is an EXACTF node that
2676  *      contains a sharp s.  When it is true, the caller sets a flag that later
2677  *      causes the optimizer in this file to not set values for the floating
2678  *      and fixed string lengths, and thus avoids the optimizer code in
2679  *      regexec.c that makes the invalid assumption.  Thus, there is no
2680  *      optimization based on string lengths for EXACTF nodes that contain the
2681  *      sharp s.  This only happens for /id rules (which means the pattern
2682  *      isn't in UTF-8).
2683  */
2684
2685 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2686     if (PL_regkind[OP(scan)] == EXACT) \
2687         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2688
2689 STATIC U32
2690 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2691     /* Merge several consecutive EXACTish nodes into one. */
2692     regnode *n = regnext(scan);
2693     U32 stringok = 1;
2694     regnode *next = scan + NODE_SZ_STR(scan);
2695     U32 merged = 0;
2696     U32 stopnow = 0;
2697 #ifdef DEBUGGING
2698     regnode *stop = scan;
2699     GET_RE_DEBUG_FLAGS_DECL;
2700 #else
2701     PERL_UNUSED_ARG(depth);
2702 #endif
2703
2704     PERL_ARGS_ASSERT_JOIN_EXACT;
2705 #ifndef EXPERIMENTAL_INPLACESCAN
2706     PERL_UNUSED_ARG(flags);
2707     PERL_UNUSED_ARG(val);
2708 #endif
2709     DEBUG_PEEP("join",scan,depth);
2710
2711     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2712      * EXACT ones that are mergeable to the current one. */
2713     while (n
2714            && (PL_regkind[OP(n)] == NOTHING
2715                || (stringok && OP(n) == OP(scan)))
2716            && NEXT_OFF(n)
2717            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2718     {
2719
2720         if (OP(n) == TAIL || n > next)
2721             stringok = 0;
2722         if (PL_regkind[OP(n)] == NOTHING) {
2723             DEBUG_PEEP("skip:",n,depth);
2724             NEXT_OFF(scan) += NEXT_OFF(n);
2725             next = n + NODE_STEP_REGNODE;
2726 #ifdef DEBUGGING
2727             if (stringok)
2728                 stop = n;
2729 #endif
2730             n = regnext(n);
2731         }
2732         else if (stringok) {
2733             const unsigned int oldl = STR_LEN(scan);
2734             regnode * const nnext = regnext(n);
2735
2736             /* XXX I (khw) kind of doubt that this works on platforms where
2737              * U8_MAX is above 255 because of lots of other assumptions */
2738             if (oldl + STR_LEN(n) > U8_MAX)
2739                 break;
2740
2741             DEBUG_PEEP("merg",n,depth);
2742             merged++;
2743
2744             NEXT_OFF(scan) += NEXT_OFF(n);
2745             STR_LEN(scan) += STR_LEN(n);
2746             next = n + NODE_SZ_STR(n);
2747             /* Now we can overwrite *n : */
2748             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2749 #ifdef DEBUGGING
2750             stop = next - 1;
2751 #endif
2752             n = nnext;
2753             if (stopnow) break;
2754         }
2755
2756 #ifdef EXPERIMENTAL_INPLACESCAN
2757         if (flags && !NEXT_OFF(n)) {
2758             DEBUG_PEEP("atch", val, depth);
2759             if (reg_off_by_arg[OP(n)]) {
2760                 ARG_SET(n, val - n);
2761             }
2762             else {
2763                 NEXT_OFF(n) = val - n;
2764             }
2765             stopnow = 1;
2766         }
2767 #endif
2768     }
2769
2770     *min_subtract = 0;
2771     *has_exactf_sharp_s = FALSE;
2772
2773     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2774      * can now analyze for sequences of problematic code points.  (Prior to
2775      * this final joining, sequences could have been split over boundaries, and
2776      * hence missed).  The sequences only happen in folding, hence for any
2777      * non-EXACT EXACTish node */
2778     if (OP(scan) != EXACT) {
2779         U8 *s;
2780         U8 * s0 = (U8*) STRING(scan);
2781         U8 * const s_end = s0 + STR_LEN(scan);
2782
2783         /* The below is perhaps overboard, but this allows us to save a test
2784          * each time through the loop at the expense of a mask.  This is
2785          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2786          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2787          * This uses an exclusive 'or' to find that bit and then inverts it to
2788          * form a mask, with just a single 0, in the bit position where 'S' and
2789          * 's' differ. */
2790         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2791         const U8 s_masked = 's' & S_or_s_mask;
2792
2793         /* One pass is made over the node's string looking for all the
2794          * possibilities.  to avoid some tests in the loop, there are two main
2795          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2796          * non-UTF-8 */
2797         if (UTF) {
2798
2799             /* There are two problematic Greek code points in Unicode
2800              * casefolding
2801              *
2802              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2803              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2804              *
2805              * which casefold to
2806              *
2807              * Unicode                      UTF-8
2808              *
2809              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2810              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2811              *
2812              * This means that in case-insensitive matching (or "loose
2813              * matching", as Unicode calls it), an EXACTF of length six (the
2814              * UTF-8 encoded byte length of the above casefolded versions) can
2815              * match a target string of length two (the byte length of UTF-8
2816              * encoded U+0390 or U+03B0).  This would rather mess up the
2817              * minimum length computation.  (there are other code points that
2818              * also fold to these two sequences, but the delta is smaller)
2819              *
2820              * If these sequences are found, the minimum length is decreased by
2821              * four (six minus two).
2822              *
2823              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2824              * LETTER SHARP S.  We decrease the min length by 1 for each
2825              * occurrence of 'ss' found */
2826
2827 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2828 #           define U390_first_byte 0xb4
2829             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2830 #           define U3B0_first_byte 0xb5
2831             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2832 #else
2833 #           define U390_first_byte 0xce
2834             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2835 #           define U3B0_first_byte 0xcf
2836             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2837 #endif
2838             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2839                                                  yields a net of 0 */
2840             /* Examine the string for one of the problematic sequences */
2841             for (s = s0;
2842                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2843                                  * sequence we are looking for is 2 */
2844                  s += UTF8SKIP(s))
2845             {
2846
2847                 /* Look for the first byte in each problematic sequence */
2848                 switch (*s) {
2849                     /* We don't have to worry about other things that fold to
2850                      * 's' (such as the long s, U+017F), as all above-latin1
2851                      * code points have been pre-folded */
2852                     case 's':
2853                     case 'S':
2854
2855                         /* Current character is an 's' or 'S'.  If next one is
2856                          * as well, we have the dreaded sequence */
2857                         if (((*(s+1) & S_or_s_mask) == s_masked)
2858                             /* These two node types don't have special handling
2859                              * for 'ss' */
2860                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2861                         {
2862                             *min_subtract += 1;
2863                             OP(scan) = EXACTFU_SS;
2864                             s++;    /* No need to look at this character again */
2865                         }
2866                         break;
2867
2868                     case U390_first_byte:
2869                         if (s_end - s >= len
2870
2871                             /* The 1's are because are skipping comparing the
2872                              * first byte */
2873                             && memEQ(s + 1, U390_tail, len - 1))
2874                         {
2875                             goto greek_sequence;
2876                         }
2877                         break;
2878
2879                     case U3B0_first_byte:
2880                         if (! (s_end - s >= len
2881                                && memEQ(s + 1, U3B0_tail, len - 1)))
2882                         {
2883                             break;
2884                         }
2885                       greek_sequence:
2886                         *min_subtract += 4;
2887
2888                         /* This requires special handling by trie's, so change
2889                          * the node type to indicate this.  If EXACTFA and
2890                          * EXACTFL were ever to be handled by trie's, this
2891                          * would have to be changed.  If this node has already
2892                          * been changed to EXACTFU_SS in this loop, leave it as
2893                          * is.  (I (khw) think it doesn't matter in regexec.c
2894                          * for UTF patterns, but no need to change it */
2895                         if (OP(scan) == EXACTFU) {
2896                             OP(scan) = EXACTFU_TRICKYFOLD;
2897                         }
2898                         s += 6; /* We already know what this sequence is.  Skip
2899                                    the rest of it */
2900                         break;
2901                 }
2902             }
2903         }
2904         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2905
2906             /* Here, the pattern is not UTF-8.  We need to look only for the
2907              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2908              * in the final position.  Otherwise we can stop looking 1 byte
2909              * earlier because have to find both the first and second 's' */
2910             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2911
2912             for (s = s0; s < upper; s++) {
2913                 switch (*s) {
2914                     case 'S':
2915                     case 's':
2916                         if (s_end - s > 1
2917                             && ((*(s+1) & S_or_s_mask) == s_masked))
2918                         {
2919                             *min_subtract += 1;
2920
2921                             /* EXACTF nodes need to know that the minimum
2922                              * length changed so that a sharp s in the string
2923                              * can match this ss in the pattern, but they
2924                              * remain EXACTF nodes, as they won't match this
2925                              * unless the target string is is UTF-8, which we
2926                              * don't know until runtime */
2927                             if (OP(scan) != EXACTF) {
2928                                 OP(scan) = EXACTFU_SS;
2929                             }
2930                             s++;
2931                         }
2932                         break;
2933                     case LATIN_SMALL_LETTER_SHARP_S:
2934                         if (OP(scan) == EXACTF) {
2935                             *has_exactf_sharp_s = TRUE;
2936                         }
2937                         break;
2938                 }
2939             }
2940         }
2941     }
2942
2943 #ifdef DEBUGGING
2944     /* Allow dumping but overwriting the collection of skipped
2945      * ops and/or strings with fake optimized ops */
2946     n = scan + NODE_SZ_STR(scan);
2947     while (n <= stop) {
2948         OP(n) = OPTIMIZED;
2949         FLAGS(n) = 0;
2950         NEXT_OFF(n) = 0;
2951         n++;
2952     }
2953 #endif
2954     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2955     return stopnow;
2956 }
2957
2958 /* REx optimizer.  Converts nodes into quicker variants "in place".
2959    Finds fixed substrings.  */
2960
2961 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2962    to the position after last scanned or to NULL. */
2963
2964 #define INIT_AND_WITHP \
2965     assert(!and_withp); \
2966     Newx(and_withp,1,struct regnode_charclass_class); \
2967     SAVEFREEPV(and_withp)
2968
2969 /* this is a chain of data about sub patterns we are processing that
2970    need to be handled separately/specially in study_chunk. Its so
2971    we can simulate recursion without losing state.  */
2972 struct scan_frame;
2973 typedef struct scan_frame {
2974     regnode *last;  /* last node to process in this frame */
2975     regnode *next;  /* next node to process when last is reached */
2976     struct scan_frame *prev; /*previous frame*/
2977     I32 stop; /* what stopparen do we use */
2978 } scan_frame;
2979
2980
2981 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2982
2983 #define CASE_SYNST_FNC(nAmE)                                       \
2984 case nAmE:                                                         \
2985     if (flags & SCF_DO_STCLASS_AND) {                              \
2986             for (value = 0; value < 256; value++)                  \
2987                 if (!is_ ## nAmE ## _cp(value))                       \
2988                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2989     }                                                              \
2990     else {                                                         \
2991             for (value = 0; value < 256; value++)                  \
2992                 if (is_ ## nAmE ## _cp(value))                        \
2993                     ANYOF_BITMAP_SET(data->start_class, value);    \
2994     }                                                              \
2995     break;                                                         \
2996 case N ## nAmE:                                                    \
2997     if (flags & SCF_DO_STCLASS_AND) {                              \
2998             for (value = 0; value < 256; value++)                   \
2999                 if (is_ ## nAmE ## _cp(value))                         \
3000                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
3001     }                                                               \
3002     else {                                                          \
3003             for (value = 0; value < 256; value++)                   \
3004                 if (!is_ ## nAmE ## _cp(value))                        \
3005                     ANYOF_BITMAP_SET(data->start_class, value);     \
3006     }                                                               \
3007     break
3008
3009
3010
3011 STATIC I32
3012 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3013                         I32 *minlenp, I32 *deltap,
3014                         regnode *last,
3015                         scan_data_t *data,
3016                         I32 stopparen,
3017                         U8* recursed,
3018                         struct regnode_charclass_class *and_withp,
3019                         U32 flags, U32 depth)
3020                         /* scanp: Start here (read-write). */
3021                         /* deltap: Write maxlen-minlen here. */
3022                         /* last: Stop before this one. */
3023                         /* data: string data about the pattern */
3024                         /* stopparen: treat close N as END */
3025                         /* recursed: which subroutines have we recursed into */
3026                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3027 {
3028     dVAR;
3029     I32 min = 0, pars = 0, code;
3030     regnode *scan = *scanp, *next;
3031     I32 delta = 0;
3032     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3033     int is_inf_internal = 0;            /* The studied chunk is infinite */
3034     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3035     scan_data_t data_fake;
3036     SV *re_trie_maxbuff = NULL;
3037     regnode *first_non_open = scan;
3038     I32 stopmin = I32_MAX;
3039     scan_frame *frame = NULL;
3040     GET_RE_DEBUG_FLAGS_DECL;
3041
3042     PERL_ARGS_ASSERT_STUDY_CHUNK;
3043
3044 #ifdef DEBUGGING
3045     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3046 #endif
3047
3048     if ( depth == 0 ) {
3049         while (first_non_open && OP(first_non_open) == OPEN)
3050             first_non_open=regnext(first_non_open);
3051     }
3052
3053
3054   fake_study_recurse:
3055     while ( scan && OP(scan) != END && scan < last ){
3056         UV min_subtract = 0;    /* How much to subtract from the minimum node
3057                                    length to get a real minimum (because the
3058                                    folded version may be shorter) */
3059         bool has_exactf_sharp_s = FALSE;
3060         /* Peephole optimizer: */
3061         DEBUG_STUDYDATA("Peep:", data,depth);
3062         DEBUG_PEEP("Peep",scan,depth);
3063
3064         /* Its not clear to khw or hv why this is done here, and not in the
3065          * clauses that deal with EXACT nodes.  khw's guess is that it's
3066          * because of a previous design */
3067         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3068
3069         /* Follow the next-chain of the current node and optimize
3070            away all the NOTHINGs from it.  */
3071         if (OP(scan) != CURLYX) {
3072             const int max = (reg_off_by_arg[OP(scan)]
3073                        ? I32_MAX
3074                        /* I32 may be smaller than U16 on CRAYs! */
3075                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3076             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3077             int noff;
3078             regnode *n = scan;
3079
3080             /* Skip NOTHING and LONGJMP. */
3081             while ((n = regnext(n))
3082                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3083                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3084                    && off + noff < max)
3085                 off += noff;
3086             if (reg_off_by_arg[OP(scan)])
3087                 ARG(scan) = off;
3088             else
3089                 NEXT_OFF(scan) = off;
3090         }
3091
3092
3093
3094         /* The principal pseudo-switch.  Cannot be a switch, since we
3095            look into several different things.  */
3096         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3097                    || OP(scan) == IFTHEN) {
3098             next = regnext(scan);
3099             code = OP(scan);
3100             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3101
3102             if (OP(next) == code || code == IFTHEN) {
3103                 /* NOTE - There is similar code to this block below for handling
3104                    TRIE nodes on a re-study.  If you change stuff here check there
3105                    too. */
3106                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3107                 struct regnode_charclass_class accum;
3108                 regnode * const startbranch=scan;
3109
3110                 if (flags & SCF_DO_SUBSTR)
3111                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3112                 if (flags & SCF_DO_STCLASS)
3113                     cl_init_zero(pRExC_state, &accum);
3114
3115                 while (OP(scan) == code) {
3116                     I32 deltanext, minnext, f = 0, fake;
3117                     struct regnode_charclass_class this_class;
3118
3119                     num++;
3120                     data_fake.flags = 0;
3121                     if (data) {
3122                         data_fake.whilem_c = data->whilem_c;
3123                         data_fake.last_closep = data->last_closep;
3124                     }
3125                     else
3126                         data_fake.last_closep = &fake;
3127
3128                     data_fake.pos_delta = delta;
3129                     next = regnext(scan);
3130                     scan = NEXTOPER(scan);
3131                     if (code != BRANCH)
3132                         scan = NEXTOPER(scan);
3133                     if (flags & SCF_DO_STCLASS) {
3134                         cl_init(pRExC_state, &this_class);
3135                         data_fake.start_class = &this_class;
3136                         f = SCF_DO_STCLASS_AND;
3137                     }
3138                     if (flags & SCF_WHILEM_VISITED_POS)
3139                         f |= SCF_WHILEM_VISITED_POS;
3140
3141                     /* we suppose the run is continuous, last=next...*/
3142                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3143                                           next, &data_fake,
3144                                           stopparen, recursed, NULL, f,depth+1);
3145                     if (min1 > minnext)
3146                         min1 = minnext;
3147                     if (max1 < minnext + deltanext)
3148                         max1 = minnext + deltanext;
3149                     if (deltanext == I32_MAX)
3150                         is_inf = is_inf_internal = 1;
3151                     scan = next;
3152                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3153                         pars++;
3154                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3155                         if ( stopmin > minnext)
3156                             stopmin = min + min1;
3157                         flags &= ~SCF_DO_SUBSTR;
3158                         if (data)
3159                             data->flags |= SCF_SEEN_ACCEPT;
3160                     }
3161                     if (data) {
3162                         if (data_fake.flags & SF_HAS_EVAL)
3163                             data->flags |= SF_HAS_EVAL;
3164                         data->whilem_c = data_fake.whilem_c;
3165                     }
3166                     if (flags & SCF_DO_STCLASS)
3167                         cl_or(pRExC_state, &accum, &this_class);
3168                 }
3169                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3170                     min1 = 0;
3171                 if (flags & SCF_DO_SUBSTR) {
3172                     data->pos_min += min1;
3173                     data->pos_delta += max1 - min1;
3174                     if (max1 != min1 || is_inf)
3175                         data->longest = &(data->longest_float);
3176                 }
3177                 min += min1;
3178                 delta += max1 - min1;
3179                 if (flags & SCF_DO_STCLASS_OR) {
3180                     cl_or(pRExC_state, data->start_class, &accum);
3181                     if (min1) {
3182                         cl_and(data->start_class, and_withp);
3183                         flags &= ~SCF_DO_STCLASS;
3184                     }
3185                 }
3186                 else if (flags & SCF_DO_STCLASS_AND) {
3187                     if (min1) {
3188                         cl_and(data->start_class, &accum);
3189                         flags &= ~SCF_DO_STCLASS;
3190                     }
3191                     else {
3192                         /* Switch to OR mode: cache the old value of
3193                          * data->start_class */
3194                         INIT_AND_WITHP;
3195                         StructCopy(data->start_class, and_withp,
3196                                    struct regnode_charclass_class);
3197                         flags &= ~SCF_DO_STCLASS_AND;
3198                         StructCopy(&accum, data->start_class,
3199                                    struct regnode_charclass_class);
3200                         flags |= SCF_DO_STCLASS_OR;
3201                         data->start_class->flags |= ANYOF_EOS;
3202                     }
3203                 }
3204
3205                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3206                 /* demq.
3207
3208                    Assuming this was/is a branch we are dealing with: 'scan' now
3209                    points at the item that follows the branch sequence, whatever
3210                    it is. We now start at the beginning of the sequence and look
3211                    for subsequences of
3212
3213                    BRANCH->EXACT=>x1
3214                    BRANCH->EXACT=>x2
3215                    tail
3216
3217                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3218
3219                    If we can find such a subsequence we need to turn the first
3220                    element into a trie and then add the subsequent branch exact
3221                    strings to the trie.
3222
3223                    We have two cases
3224
3225                      1. patterns where the whole set of branches can be converted.
3226
3227                      2. patterns where only a subset can be converted.
3228
3229                    In case 1 we can replace the whole set with a single regop
3230                    for the trie. In case 2 we need to keep the start and end
3231                    branches so
3232
3233                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3234                      becomes BRANCH TRIE; BRANCH X;
3235
3236                   There is an additional case, that being where there is a
3237                   common prefix, which gets split out into an EXACT like node
3238                   preceding the TRIE node.
3239
3240                   If x(1..n)==tail then we can do a simple trie, if not we make
3241                   a "jump" trie, such that when we match the appropriate word
3242                   we "jump" to the appropriate tail node. Essentially we turn
3243                   a nested if into a case structure of sorts.
3244
3245                 */
3246
3247                     int made=0;
3248                     if (!re_trie_maxbuff) {
3249                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3250                         if (!SvIOK(re_trie_maxbuff))
3251                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3252                     }
3253                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3254                         regnode *cur;
3255                         regnode *first = (regnode *)NULL;
3256                         regnode *last = (regnode *)NULL;
3257                         regnode *tail = scan;
3258                         U8 trietype = 0;
3259                         U32 count=0;
3260
3261 #ifdef DEBUGGING
3262                         SV * const mysv = sv_newmortal();       /* for dumping */
3263 #endif
3264                         /* var tail is used because there may be a TAIL
3265                            regop in the way. Ie, the exacts will point to the
3266                            thing following the TAIL, but the last branch will
3267                            point at the TAIL. So we advance tail. If we
3268                            have nested (?:) we may have to move through several
3269                            tails.
3270                          */
3271
3272                         while ( OP( tail ) == TAIL ) {
3273                             /* this is the TAIL generated by (?:) */
3274                             tail = regnext( tail );
3275                         }
3276
3277
3278                         DEBUG_TRIE_COMPILE_r({
3279                             regprop(RExC_rx, mysv, tail );
3280                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3281                                 (int)depth * 2 + 2, "",
3282                                 "Looking for TRIE'able sequences. Tail node is: ",
3283                                 SvPV_nolen_const( mysv )
3284                             );
3285                         });
3286
3287                         /*
3288
3289                             Step through the branches
3290                                 cur represents each branch,
3291                                 noper is the first thing to be matched as part of that branch
3292                                 noper_next is the regnext() of that node.
3293
3294                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3295                             via a "jump trie" but we also support building with NOJUMPTRIE,
3296                             which restricts the trie logic to structures like /FOO|BAR/.
3297
3298                             If noper is a trieable nodetype then the branch is a possible optimization
3299                             target. If we are building under NOJUMPTRIE then we require that noper_next
3300                             is the same as scan (our current position in the regex program).
3301
3302                             Once we have two or more consecutive such branches we can create a
3303                             trie of the EXACT's contents and stitch it in place into the program.
3304
3305                             If the sequence represents all of the branches in the alternation we
3306                             replace the entire thing with a single TRIE node.
3307
3308                             Otherwise when it is a subsequence we need to stitch it in place and
3309                             replace only the relevant branches. This means the first branch has
3310                             to remain as it is used by the alternation logic, and its next pointer,
3311                             and needs to be repointed at the item on the branch chain following
3312                             the last branch we have optimized away.
3313
3314                             This could be either a BRANCH, in which case the subsequence is internal,
3315                             or it could be the item following the branch sequence in which case the
3316                             subsequence is at the end (which does not necessarily mean the first node
3317                             is the start of the alternation).
3318
3319                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3320
3321                                 optype          |  trietype
3322                                 ----------------+-----------
3323                                 NOTHING         | NOTHING
3324                                 EXACT           | EXACT
3325                                 EXACTFU         | EXACTFU
3326                                 EXACTFU_SS      | EXACTFU
3327                                 EXACTFU_TRICKYFOLD | EXACTFU
3328                                 EXACTFA         | 0
3329
3330
3331                         */
3332 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3333                        ( EXACT == (X) )   ? EXACT :        \
3334                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3335                        0 )
3336
3337                         /* dont use tail as the end marker for this traverse */
3338                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3339                             regnode * const noper = NEXTOPER( cur );
3340                             U8 noper_type = OP( noper );
3341                             U8 noper_trietype = TRIE_TYPE( noper_type );
3342 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3343                             regnode * const noper_next = regnext( noper );
3344                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3345                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3346 #endif
3347
3348                             DEBUG_TRIE_COMPILE_r({
3349                                 regprop(RExC_rx, mysv, cur);
3350                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3351                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3352
3353                                 regprop(RExC_rx, mysv, noper);
3354                                 PerlIO_printf( Perl_debug_log, " -> %s",
3355                                     SvPV_nolen_const(mysv));
3356
3357                                 if ( noper_next ) {
3358                                   regprop(RExC_rx, mysv, noper_next );
3359                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3360                                     SvPV_nolen_const(mysv));
3361                                 }
3362                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3363                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3364                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3365                                 );
3366                             });
3367
3368                             /* Is noper a trieable nodetype that can be merged with the
3369                              * current trie (if there is one)? */
3370                             if ( noper_trietype
3371                                   &&
3372                                   (
3373                                         ( noper_trietype == NOTHING)
3374                                         || ( trietype == NOTHING )
3375                                         || ( trietype == noper_trietype )
3376                                   )
3377 #ifdef NOJUMPTRIE
3378                                   && noper_next == tail
3379 #endif
3380                                   && count < U16_MAX)
3381                             {
3382                                 /* Handle mergable triable node
3383                                  * Either we are the first node in a new trieable sequence,
3384                                  * in which case we do some bookkeeping, otherwise we update
3385                                  * the end pointer. */
3386                                 if ( !first ) {
3387                                     first = cur;
3388                                     if ( noper_trietype == NOTHING ) {
3389 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3390                                         regnode * const noper_next = regnext( noper );
3391                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3392                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3393 #endif
3394
3395                                         if ( noper_next_trietype ) {
3396                                             trietype = noper_next_trietype;
3397                                         } else if (noper_next_type)  {
3398                                             /* a NOTHING regop is 1 regop wide. We need at least two
3399                                              * for a trie so we can't merge this in */
3400                                             first = NULL;
3401                                         }
3402                                     } else {
3403                                         trietype = noper_trietype;
3404                                     }
3405                                 } else {
3406                                     if ( trietype == NOTHING )
3407                                         trietype = noper_trietype;
3408                                     last = cur;
3409                                 }
3410                                 if (first)
3411                                     count++;
3412                             } /* end handle mergable triable node */
3413                             else {
3414                                 /* handle unmergable node -
3415                                  * noper may either be a triable node which can not be tried
3416                                  * together with the current trie, or a non triable node */
3417                                 if ( last ) {
3418                                     /* If last is set and trietype is not NOTHING then we have found
3419                                      * at least two triable branch sequences in a row of a similar
3420                                      * trietype so we can turn them into a trie. If/when we
3421                                      * allow NOTHING to start a trie sequence this condition will be
3422                                      * required, and it isn't expensive so we leave it in for now. */
3423                                     if ( trietype != NOTHING )
3424                                         make_trie( pRExC_state,
3425                                                 startbranch, first, cur, tail, count,
3426                                                 trietype, depth+1 );
3427                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3428                                 }
3429                                 if ( noper_trietype
3430 #ifdef NOJUMPTRIE
3431                                      && noper_next == tail
3432 #endif
3433                                 ){
3434                                     /* noper is triable, so we can start a new trie sequence */
3435                                     count = 1;
3436                                     first = cur;
3437                                     trietype = noper_trietype;
3438                                 } else if (first) {
3439                                     /* if we already saw a first but the current node is not triable then we have
3440                                      * to reset the first information. */
3441                                     count = 0;
3442                                     first = NULL;
3443                                     trietype = 0;
3444                                 }
3445                             } /* end handle unmergable node */
3446                         } /* loop over branches */
3447                         DEBUG_TRIE_COMPILE_r({
3448                             regprop(RExC_rx, mysv, cur);
3449                             PerlIO_printf( Perl_debug_log,
3450                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3451                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3452
3453                         });
3454                         if ( last ) {
3455                             if ( trietype != NOTHING ) {
3456                                 /* the last branch of the sequence was part of a trie,
3457                                  * so we have to construct it here outside of the loop
3458                                  */
3459                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3460 #ifdef TRIE_STUDY_OPT
3461                                 if ( ((made == MADE_EXACT_TRIE &&
3462                                      startbranch == first)
3463                                      || ( first_non_open == first )) &&
3464                                      depth==0 ) {
3465                                     flags |= SCF_TRIE_RESTUDY;
3466                                     if ( startbranch == first
3467                                          && scan == tail )
3468                                     {
3469                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3470                                     }
3471                                 }
3472 #endif
3473                             } else {
3474                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3475                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3476                                  */
3477                                 if ( startbranch == first ) {
3478                                     regnode *opt;
3479                                     /* the entire thing is a NOTHING sequence, something like this:
3480                                      * (?:|) So we can turn it into a plain NOTHING op. */
3481                                     DEBUG_TRIE_COMPILE_r({
3482                                         regprop(RExC_rx, mysv, cur);
3483                                         PerlIO_printf( Perl_debug_log,
3484                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3485                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3486
3487                                     });
3488                                     OP(startbranch)= NOTHING;
3489                                     NEXT_OFF(startbranch)= tail - startbranch;
3490                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3491                                         OP(opt)= OPTIMIZED;
3492                                 }
3493                             }
3494                         } /* end if ( last) */
3495                     } /* TRIE_MAXBUF is non zero */
3496
3497                 } /* do trie */
3498
3499             }
3500             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3501                 scan = NEXTOPER(NEXTOPER(scan));
3502             } else                      /* single branch is optimized. */
3503                 scan = NEXTOPER(scan);
3504             continue;
3505         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3506             scan_frame *newframe = NULL;
3507             I32 paren;
3508             regnode *start;
3509             regnode *end;
3510
3511             if (OP(scan) != SUSPEND) {
3512             /* set the pointer */
3513                 if (OP(scan) == GOSUB) {
3514                     paren = ARG(scan);
3515                     RExC_recurse[ARG2L(scan)] = scan;
3516                     start = RExC_open_parens[paren-1];
3517                     end   = RExC_close_parens[paren-1];
3518                 } else {
3519                     paren = 0;
3520                     start = RExC_rxi->program + 1;
3521                     end   = RExC_opend;
3522                 }
3523                 if (!recursed) {
3524                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3525                     SAVEFREEPV(recursed);
3526                 }
3527                 if (!PAREN_TEST(recursed,paren+1)) {
3528                     PAREN_SET(recursed,paren+1);
3529                     Newx(newframe,1,scan_frame);
3530                 } else {
3531                     if (flags & SCF_DO_SUBSTR) {
3532                         SCAN_COMMIT(pRExC_state,data,minlenp);
3533                         data->longest = &(data->longest_float);
3534                     }
3535                     is_inf = is_inf_internal = 1;
3536                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3537                         cl_anything(pRExC_state, data->start_class);
3538                     flags &= ~SCF_DO_STCLASS;
3539                 }
3540             } else {
3541                 Newx(newframe,1,scan_frame);
3542                 paren = stopparen;
3543                 start = scan+2;
3544                 end = regnext(scan);
3545             }
3546             if (newframe) {
3547                 assert(start);
3548                 assert(end);
3549                 SAVEFREEPV(newframe);
3550                 newframe->next = regnext(scan);
3551                 newframe->last = last;
3552                 newframe->stop = stopparen;
3553                 newframe->prev = frame;
3554
3555                 frame = newframe;
3556                 scan =  start;
3557                 stopparen = paren;
3558                 last = end;
3559
3560                 continue;
3561             }
3562         }
3563         else if (OP(scan) == EXACT) {
3564             I32 l = STR_LEN(scan);
3565             UV uc;
3566             if (UTF) {
3567                 const U8 * const s = (U8*)STRING(scan);
3568                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3569                 l = utf8_length(s, s + l);
3570             } else {
3571                 uc = *((U8*)STRING(scan));
3572             }
3573             min += l;
3574             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3575                 /* The code below prefers earlier match for fixed
3576                    offset, later match for variable offset.  */
3577                 if (data->last_end == -1) { /* Update the start info. */
3578                     data->last_start_min = data->pos_min;
3579                     data->last_start_max = is_inf
3580                         ? I32_MAX : data->pos_min + data->pos_delta;
3581                 }
3582                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3583                 if (UTF)
3584                     SvUTF8_on(data->last_found);
3585                 {
3586                     SV * const sv = data->last_found;
3587                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3588                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3589                     if (mg && mg->mg_len >= 0)
3590                         mg->mg_len += utf8_length((U8*)STRING(scan),
3591                                                   (U8*)STRING(scan)+STR_LEN(scan));
3592                 }
3593                 data->last_end = data->pos_min + l;
3594                 data->pos_min += l; /* As in the first entry. */
3595                 data->flags &= ~SF_BEFORE_EOL;
3596             }
3597             if (flags & SCF_DO_STCLASS_AND) {
3598                 /* Check whether it is compatible with what we know already! */
3599                 int compat = 1;
3600
3601
3602                 /* If compatible, we or it in below.  It is compatible if is
3603                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3604                  * it's for a locale.  Even if there isn't unicode semantics
3605                  * here, at runtime there may be because of matching against a
3606                  * utf8 string, so accept a possible false positive for
3607                  * latin1-range folds */
3608                 if (uc >= 0x100 ||
3609                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3610                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3611                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3612                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3613                     )
3614                 {
3615                     compat = 0;
3616                 }
3617                 ANYOF_CLASS_ZERO(data->start_class);
3618                 ANYOF_BITMAP_ZERO(data->start_class);
3619                 if (compat)
3620                     ANYOF_BITMAP_SET(data->start_class, uc);
3621                 else if (uc >= 0x100) {
3622                     int i;
3623
3624                     /* Some Unicode code points fold to the Latin1 range; as
3625                      * XXX temporary code, instead of figuring out if this is
3626                      * one, just assume it is and set all the start class bits
3627                      * that could be some such above 255 code point's fold
3628                      * which will generate fals positives.  As the code
3629                      * elsewhere that does compute the fold settles down, it
3630                      * can be extracted out and re-used here */
3631                     for (i = 0; i < 256; i++){
3632                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3633                             ANYOF_BITMAP_SET(data->start_class, i);
3634                         }
3635                     }
3636                 }
3637                 data->start_class->flags &= ~ANYOF_EOS;
3638                 if (uc < 0x100)
3639                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3640             }
3641             else if (flags & SCF_DO_STCLASS_OR) {
3642                 /* false positive possible if the class is case-folded */
3643                 if (uc < 0x100)
3644                     ANYOF_BITMAP_SET(data->start_class, uc);
3645                 else
3646                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3647                 data->start_class->flags &= ~ANYOF_EOS;
3648                 cl_and(data->start_class, and_withp);
3649             }
3650             flags &= ~SCF_DO_STCLASS;
3651         }
3652         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3653             I32 l = STR_LEN(scan);
3654             UV uc = *((U8*)STRING(scan));
3655
3656             /* Search for fixed substrings supports EXACT only. */
3657             if (flags & SCF_DO_SUBSTR) {
3658                 assert(data);
3659                 SCAN_COMMIT(pRExC_state, data, minlenp);
3660             }
3661             if (UTF) {
3662                 const U8 * const s = (U8 *)STRING(scan);
3663                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3664                 l = utf8_length(s, s + l);
3665             }
3666             if (has_exactf_sharp_s) {
3667                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3668             }
3669             min += l - min_subtract;
3670             if (min < 0) {
3671                 min = 0;
3672             }
3673             delta += min_subtract;
3674             if (flags & SCF_DO_SUBSTR) {
3675                 data->pos_min += l - min_subtract;
3676                 if (data->pos_min < 0) {
3677                     data->pos_min = 0;
3678                 }
3679                 data->pos_delta += min_subtract;
3680                 if (min_subtract) {
3681                     data->longest = &(data->longest_float);
3682                 }
3683             }
3684             if (flags & SCF_DO_STCLASS_AND) {
3685                 /* Check whether it is compatible with what we know already! */
3686                 int compat = 1;
3687                 if (uc >= 0x100 ||
3688                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3689                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3690                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3691                 {
3692                     compat = 0;
3693                 }
3694                 ANYOF_CLASS_ZERO(data->start_class);
3695                 ANYOF_BITMAP_ZERO(data->start_class);
3696                 if (compat) {
3697                     ANYOF_BITMAP_SET(data->start_class, uc);
3698                     data->start_class->flags &= ~ANYOF_EOS;
3699                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3700                     if (OP(scan) == EXACTFL) {
3701                         /* XXX This set is probably no longer necessary, and
3702                          * probably wrong as LOCALE now is on in the initial
3703                          * state */
3704                         data->start_class->flags |= ANYOF_LOCALE;
3705                     }
3706                     else {
3707
3708                         /* Also set the other member of the fold pair.  In case
3709                          * that unicode semantics is called for at runtime, use
3710                          * the full latin1 fold.  (Can't do this for locale,
3711                          * because not known until runtime) */
3712                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3713
3714                         /* All other (EXACTFL handled above) folds except under
3715                          * /iaa that include s, S, and sharp_s also may include
3716                          * the others */
3717                         if (OP(scan) != EXACTFA) {
3718                             if (uc == 's' || uc == 'S') {
3719                                 ANYOF_BITMAP_SET(data->start_class,
3720                                                  LATIN_SMALL_LETTER_SHARP_S);
3721                             }
3722                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3723                                 ANYOF_BITMAP_SET(data->start_class, 's');
3724                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3725                             }
3726                         }
3727                     }
3728                 }
3729                 else if (uc >= 0x100) {
3730                     int i;
3731                     for (i = 0; i < 256; i++){
3732                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3733                             ANYOF_BITMAP_SET(data->start_class, i);
3734                         }
3735                     }
3736                 }
3737             }
3738             else if (flags & SCF_DO_STCLASS_OR) {
3739                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3740                     /* false positive possible if the class is case-folded.
3741                        Assume that the locale settings are the same... */
3742                     if (uc < 0x100) {
3743                         ANYOF_BITMAP_SET(data->start_class, uc);
3744                         if (OP(scan) != EXACTFL) {
3745
3746                             /* And set the other member of the fold pair, but
3747                              * can't do that in locale because not known until
3748                              * run-time */
3749                             ANYOF_BITMAP_SET(data->start_class,
3750                                              PL_fold_latin1[uc]);
3751
3752                             /* All folds except under /iaa that include s, S,
3753                              * and sharp_s also may include the others */
3754                             if (OP(scan) != EXACTFA) {
3755                                 if (uc == 's' || uc == 'S') {
3756                                     ANYOF_BITMAP_SET(data->start_class,
3757                                                    LATIN_SMALL_LETTER_SHARP_S);
3758                                 }
3759                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3760                                     ANYOF_BITMAP_SET(data->start_class, 's');
3761                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3762                                 }
3763                             }
3764                         }
3765                     }
3766                     data->start_class->flags &= ~ANYOF_EOS;
3767                 }
3768                 cl_and(data->start_class, and_withp);
3769             }
3770             flags &= ~SCF_DO_STCLASS;
3771         }
3772         else if (REGNODE_VARIES(OP(scan))) {
3773             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3774             I32 f = flags, pos_before = 0;
3775             regnode * const oscan = scan;
3776             struct regnode_charclass_class this_class;
3777             struct regnode_charclass_class *oclass = NULL;
3778             I32 next_is_eval = 0;
3779
3780             switch (PL_regkind[OP(scan)]) {
3781             case WHILEM:                /* End of (?:...)* . */
3782                 scan = NEXTOPER(scan);
3783                 goto finish;
3784             case PLUS:
3785                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3786                     next = NEXTOPER(scan);
3787                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3788                         mincount = 1;
3789                         maxcount = REG_INFTY;
3790                         next = regnext(scan);
3791                         scan = NEXTOPER(scan);
3792                         goto do_curly;
3793                     }
3794                 }
3795                 if (flags & SCF_DO_SUBSTR)
3796                     data->pos_min++;
3797                 min++;
3798                 /* Fall through. */
3799             case STAR:
3800                 if (flags & SCF_DO_STCLASS) {
3801                     mincount = 0;
3802                     maxcount = REG_INFTY;
3803                     next = regnext(scan);
3804                     scan = NEXTOPER(scan);
3805                     goto do_curly;
3806                 }
3807                 is_inf = is_inf_internal = 1;
3808                 scan = regnext(scan);
3809                 if (flags & SCF_DO_SUBSTR) {
3810                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3811                     data->longest = &(data->longest_float);
3812                 }
3813                 goto optimize_curly_tail;
3814             case CURLY:
3815                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3816                     && (scan->flags == stopparen))
3817                 {
3818                     mincount = 1;
3819                     maxcount = 1;
3820                 } else {
3821                     mincount = ARG1(scan);
3822                     maxcount = ARG2(scan);
3823                 }
3824                 next = regnext(scan);
3825                 if (OP(scan) == CURLYX) {
3826                     I32 lp = (data ? *(data->last_closep) : 0);
3827                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3828                 }
3829                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3830                 next_is_eval = (OP(scan) == EVAL);
3831               do_curly:
3832                 if (flags & SCF_DO_SUBSTR) {
3833                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3834                     pos_before = data->pos_min;
3835                 }
3836                 if (data) {
3837                     fl = data->flags;
3838                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3839                     if (is_inf)
3840                         data->flags |= SF_IS_INF;
3841                 }
3842                 if (flags & SCF_DO_STCLASS) {
3843                     cl_init(pRExC_state, &this_class);
3844                     oclass = data->start_class;
3845                     data->start_class = &this_class;
3846                     f |= SCF_DO_STCLASS_AND;
3847                     f &= ~SCF_DO_STCLASS_OR;
3848                 }
3849                 /* Exclude from super-linear cache processing any {n,m}
3850                    regops for which the combination of input pos and regex
3851                    pos is not enough information to determine if a match
3852                    will be possible.
3853
3854                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3855                    regex pos at the \s*, the prospects for a match depend not
3856                    only on the input position but also on how many (bar\s*)
3857                    repeats into the {4,8} we are. */
3858                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3859                     f &= ~SCF_WHILEM_VISITED_POS;
3860
3861                 /* This will finish on WHILEM, setting scan, or on NULL: */
3862                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3863                                       last, data, stopparen, recursed, NULL,
3864                                       (mincount == 0
3865                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3866
3867                 if (flags & SCF_DO_STCLASS)
3868                     data->start_class = oclass;
3869                 if (mincount == 0 || minnext == 0) {
3870                     if (flags & SCF_DO_STCLASS_OR) {
3871                         cl_or(pRExC_state, data->start_class, &this_class);
3872                     }
3873                     else if (flags & SCF_DO_STCLASS_AND) {
3874                         /* Switch to OR mode: cache the old value of
3875                          * data->start_class */
3876                         INIT_AND_WITHP;
3877                         StructCopy(data->start_class, and_withp,
3878                                    struct regnode_charclass_class);
3879                         flags &= ~SCF_DO_STCLASS_AND;
3880                         StructCopy(&this_class, data->start_class,
3881                                    struct regnode_charclass_class);
3882                         flags |= SCF_DO_STCLASS_OR;
3883                         data->start_class->flags |= ANYOF_EOS;
3884                     }
3885                 } else {                /* Non-zero len */
3886                     if (flags & SCF_DO_STCLASS_OR) {
3887                         cl_or(pRExC_state, data->start_class, &this_class);
3888                         cl_and(data->start_class, and_withp);
3889                     }
3890                     else if (flags & SCF_DO_STCLASS_AND)
3891                         cl_and(data->start_class, &this_class);
3892                     flags &= ~SCF_DO_STCLASS;
3893                 }
3894                 if (!scan)              /* It was not CURLYX, but CURLY. */
3895                     scan = next;
3896                 if ( /* ? quantifier ok, except for (?{ ... }) */
3897                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3898                     && (minnext == 0) && (deltanext == 0)
3899                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3900                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3901                 {
3902                     ckWARNreg(RExC_parse,
3903                               "Quantifier unexpected on zero-length expression");
3904                 }
3905
3906                 min += minnext * mincount;
3907                 is_inf_internal |= ((maxcount == REG_INFTY
3908                                      && (minnext + deltanext) > 0)
3909                                     || deltanext == I32_MAX);
3910                 is_inf |= is_inf_internal;
3911                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3912
3913                 /* Try powerful optimization CURLYX => CURLYN. */
3914                 if (  OP(oscan) == CURLYX && data
3915                       && data->flags & SF_IN_PAR
3916                       && !(data->flags & SF_HAS_EVAL)
3917                       && !deltanext && minnext == 1 ) {
3918                     /* Try to optimize to CURLYN.  */
3919                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3920                     regnode * const nxt1 = nxt;
3921 #ifdef DEBUGGING
3922                     regnode *nxt2;
3923 #endif
3924
3925                     /* Skip open. */
3926                     nxt = regnext(nxt);
3927                     if (!REGNODE_SIMPLE(OP(nxt))
3928                         && !(PL_regkind[OP(nxt)] == EXACT
3929                              && STR_LEN(nxt) == 1))
3930                         goto nogo;
3931 #ifdef DEBUGGING
3932                     nxt2 = nxt;
3933 #endif
3934                     nxt = regnext(nxt);
3935                     if (OP(nxt) != CLOSE)
3936                         goto nogo;
3937                     if (RExC_open_parens) {
3938                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3939                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3940                     }
3941                     /* Now we know that nxt2 is the only contents: */
3942                     oscan->flags = (U8)ARG(nxt);
3943                     OP(oscan) = CURLYN;
3944                     OP(nxt1) = NOTHING; /* was OPEN. */
3945
3946 #ifdef DEBUGGING
3947                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3948                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3949                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3950                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3951                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3952                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3953 #endif
3954                 }
3955               nogo:
3956
3957                 /* Try optimization CURLYX => CURLYM. */
3958                 if (  OP(oscan) == CURLYX && data
3959                       && !(data->flags & SF_HAS_PAR)
3960                       && !(data->flags & SF_HAS_EVAL)
3961                       && !deltanext     /* atom is fixed width */
3962                       && minnext != 0   /* CURLYM can't handle zero width */
3963                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3964                 ) {
3965                     /* XXXX How to optimize if data == 0? */
3966                     /* Optimize to a simpler form.  */
3967                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3968                     regnode *nxt2;
3969
3970                     OP(oscan) = CURLYM;
3971                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3972                             && (OP(nxt2) != WHILEM))
3973                         nxt = nxt2;
3974                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3975                     /* Need to optimize away parenths. */
3976                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3977                         /* Set the parenth number.  */
3978                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3979
3980                         oscan->flags = (U8)ARG(nxt);
3981                         if (RExC_open_parens) {
3982                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3983                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3984                         }
3985                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3986                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3987
3988 #ifdef DEBUGGING
3989                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3990                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3991                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3992                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3993 #endif
3994 #if 0
3995                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3996                             regnode *nnxt = regnext(nxt1);
3997                             if (nnxt == nxt) {
3998                                 if (reg_off_by_arg[OP(nxt1)])
3999                                     ARG_SET(nxt1, nxt2 - nxt1);
4000                                 else if (nxt2 - nxt1 < U16_MAX)
4001                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4002                                 else
4003                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4004                             }
4005                             nxt1 = nnxt;
4006                         }
4007 #endif
4008                         /* Optimize again: */
4009                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4010                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4011                     }
4012                     else
4013                         oscan->flags = 0;
4014                 }
4015                 else if ((OP(oscan) == CURLYX)
4016                          && (flags & SCF_WHILEM_VISITED_POS)
4017                          /* See the comment on a similar expression above.
4018                             However, this time it's not a subexpression
4019                             we care about, but the expression itself. */
4020                          && (maxcount == REG_INFTY)
4021                          && data && ++data->whilem_c < 16) {
4022                     /* This stays as CURLYX, we can put the count/of pair. */
4023                     /* Find WHILEM (as in regexec.c) */
4024                     regnode *nxt = oscan + NEXT_OFF(oscan);
4025
4026                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4027                         nxt += ARG(nxt);
4028                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4029                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4030                 }
4031                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4032                     pars++;
4033                 if (flags & SCF_DO_SUBSTR) {
4034                     SV *last_str = NULL;
4035                     int counted = mincount != 0;
4036
4037                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4038 #if defined(SPARC64_GCC_WORKAROUND)
4039                         I32 b = 0;
4040                         STRLEN l = 0;
4041                         const char *s = NULL;
4042                         I32 old = 0;
4043
4044                         if (pos_before >= data->last_start_min)
4045                             b = pos_before;
4046                         else
4047                             b = data->last_start_min;
4048
4049                         l = 0;
4050                         s = SvPV_const(data->last_found, l);
4051                         old = b - data->last_start_min;
4052
4053 #else
4054                         I32 b = pos_before >= data->last_start_min
4055                             ? pos_before : data->last_start_min;
4056                         STRLEN l;
4057                         const char * const s = SvPV_const(data->last_found, l);
4058                         I32 old = b - data->last_start_min;
4059 #endif
4060
4061                         if (UTF)
4062                             old = utf8_hop((U8*)s, old) - (U8*)s;
4063                         l -= old;
4064                         /* Get the added string: */
4065                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4066                         if (deltanext == 0 && pos_before == b) {
4067                             /* What was added is a constant string */
4068                             if (mincount > 1) {
4069                                 SvGROW(last_str, (mincount * l) + 1);
4070                                 repeatcpy(SvPVX(last_str) + l,
4071                                           SvPVX_const(last_str), l, mincount - 1);
4072                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4073                                 /* Add additional parts. */
4074                                 SvCUR_set(data->last_found,
4075                                           SvCUR(data->last_found) - l);
4076                                 sv_catsv(data->last_found, last_str);
4077                                 {
4078                                     SV * sv = data->last_found;
4079                                     MAGIC *mg =
4080                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4081                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4082                                     if (mg && mg->mg_len >= 0)
4083                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4084                                 }
4085                                 data->last_end += l * (mincount - 1);
4086                             }
4087                         } else {
4088                             /* start offset must point into the last copy */
4089                             data->last_start_min += minnext * (mincount - 1);
4090                             data->last_start_max += is_inf ? I32_MAX
4091                                 : (maxcount - 1) * (minnext + data->pos_delta);
4092                         }
4093                     }
4094                     /* It is counted once already... */
4095                     data->pos_min += minnext * (mincount - counted);
4096                     data->pos_delta += - counted * deltanext +
4097                         (minnext + deltanext) * maxcount - minnext * mincount;
4098                     if (mincount != maxcount) {
4099                          /* Cannot extend fixed substrings found inside
4100                             the group.  */
4101                         SCAN_COMMIT(pRExC_state,data,minlenp);
4102                         if (mincount && last_str) {
4103                             SV * const sv = data->last_found;
4104                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4105                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4106
4107                             if (mg)
4108                                 mg->mg_len = -1;
4109                             sv_setsv(sv, last_str);
4110                             data->last_end = data->pos_min;
4111                             data->last_start_min =
4112                                 data->pos_min - CHR_SVLEN(last_str);
4113                             data->last_start_max = is_inf
4114                                 ? I32_MAX
4115                                 : data->pos_min + data->pos_delta
4116                                 - CHR_SVLEN(last_str);
4117                         }
4118                         data->longest = &(data->longest_float);
4119                     }
4120                     SvREFCNT_dec(last_str);
4121                 }
4122                 if (data && (fl & SF_HAS_EVAL))
4123                     data->flags |= SF_HAS_EVAL;
4124               optimize_curly_tail:
4125                 if (OP(oscan) != CURLYX) {
4126                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4127                            && NEXT_OFF(next))
4128                         NEXT_OFF(oscan) += NEXT_OFF(next);
4129                 }
4130                 continue;
4131             default:                    /* REF, ANYOFV, and CLUMP only? */
4132                 if (flags & SCF_DO_SUBSTR) {
4133                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4134                     data->longest = &(data->longest_float);
4135                 }
4136                 is_inf = is_inf_internal = 1;
4137                 if (flags & SCF_DO_STCLASS_OR)
4138                     cl_anything(pRExC_state, data->start_class);
4139                 flags &= ~SCF_DO_STCLASS;
4140                 break;
4141             }
4142         }
4143         else if (OP(scan) == LNBREAK) {
4144             if (flags & SCF_DO_STCLASS) {
4145                 int value = 0;
4146                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4147                 if (flags & SCF_DO_STCLASS_AND) {
4148                     for (value = 0; value < 256; value++)
4149                         if (!is_VERTWS_cp(value))
4150                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4151                 }
4152                 else {
4153                     for (value = 0; value < 256; value++)
4154                         if (is_VERTWS_cp(value))
4155                             ANYOF_BITMAP_SET(data->start_class, value);
4156                 }
4157                 if (flags & SCF_DO_STCLASS_OR)
4158                     cl_and(data->start_class, and_withp);
4159                 flags &= ~SCF_DO_STCLASS;
4160             }
4161             min += 1;
4162             delta += 1;
4163             if (flags & SCF_DO_SUBSTR) {
4164                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4165                 data->pos_min += 1;
4166                 data->pos_delta += 1;
4167                 data->longest = &(data->longest_float);
4168             }
4169         }
4170         else if (REGNODE_SIMPLE(OP(scan))) {
4171             int value = 0;
4172
4173             if (flags & SCF_DO_SUBSTR) {
4174                 SCAN_COMMIT(pRExC_state,data,minlenp);
4175                 data->pos_min++;
4176             }
4177             min++;
4178             if (flags & SCF_DO_STCLASS) {
4179                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4180
4181                 /* Some of the logic below assumes that switching
4182                    locale on will only add false positives. */
4183                 switch (PL_regkind[OP(scan)]) {
4184                 case SANY:
4185                 default:
4186                   do_default:
4187                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4188                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4189                         cl_anything(pRExC_state, data->start_class);
4190                     break;
4191                 case REG_ANY:
4192                     if (OP(scan) == SANY)
4193                         goto do_default;
4194                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4195                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4196                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4197                         cl_anything(pRExC_state, data->start_class);
4198                     }
4199                     if (flags & SCF_DO_STCLASS_AND || !value)
4200                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4201                     break;
4202                 case ANYOF:
4203                     if (flags & SCF_DO_STCLASS_AND)
4204                         cl_and(data->start_class,
4205                                (struct regnode_charclass_class*)scan);
4206                     else
4207                         cl_or(pRExC_state, data->start_class,
4208                               (struct regnode_charclass_class*)scan);
4209                     break;
4210                 case ALNUM:
4211                     if (flags & SCF_DO_STCLASS_AND) {
4212                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4213                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4214                             if (OP(scan) == ALNUMU) {
4215                                 for (value = 0; value < 256; value++) {
4216                                     if (!isWORDCHAR_L1(value)) {
4217                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4218                                     }
4219                                 }
4220                             } else {
4221                                 for (value = 0; value < 256; value++) {
4222                                     if (!isALNUM(value)) {
4223                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4224                                     }
4225                                 }
4226                             }
4227                         }
4228                     }
4229                     else {
4230                         if (data->start_class->flags & ANYOF_LOCALE)
4231                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4232
4233                         /* Even if under locale, set the bits for non-locale
4234                          * in case it isn't a true locale-node.  This will
4235                          * create false positives if it truly is locale */
4236                         if (OP(scan) == ALNUMU) {
4237                             for (value = 0; value < 256; value++) {
4238                                 if (isWORDCHAR_L1(value)) {
4239                                     ANYOF_BITMAP_SET(data->start_class, value);
4240                                 }
4241                             }
4242                         } else {
4243                             for (value = 0; value < 256; value++) {
4244                                 if (isALNUM(value)) {
4245                                     ANYOF_BITMAP_SET(data->start_class, value);
4246                                 }
4247                             }
4248                         }
4249                     }
4250                     break;
4251                 case NALNUM:
4252                     if (flags & SCF_DO_STCLASS_AND) {
4253                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4254                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4255                             if (OP(scan) == NALNUMU) {
4256                                 for (value = 0; value < 256; value++) {
4257                                     if (isWORDCHAR_L1(value)) {
4258                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4259                                     }
4260                                 }
4261                             } else {
4262                                 for (value = 0; value < 256; value++) {
4263                                     if (isALNUM(value)) {
4264                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4265                                     }
4266                                 }
4267                             }
4268                         }
4269                     }
4270                     else {
4271                         if (data->start_class->flags & ANYOF_LOCALE)
4272                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4273
4274                         /* Even if under locale, set the bits for non-locale in
4275                          * case it isn't a true locale-node.  This will create
4276                          * false positives if it truly is locale */
4277                         if (OP(scan) == NALNUMU) {
4278                             for (value = 0; value < 256; value++) {
4279                                 if (! isWORDCHAR_L1(value)) {
4280                                     ANYOF_BITMAP_SET(data->start_class, value);
4281                                 }
4282                             }
4283                         } else {
4284                             for (value = 0; value < 256; value++) {
4285                                 if (! isALNUM(value)) {
4286                                     ANYOF_BITMAP_SET(data->start_class, value);
4287                                 }
4288                             }
4289                         }
4290                     }
4291                     break;
4292                 case SPACE:
4293                     if (flags & SCF_DO_STCLASS_AND) {
4294                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4295                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4296                             if (OP(scan) == SPACEU) {
4297                                 for (value = 0; value < 256; value++) {
4298                                     if (!isSPACE_L1(value)) {
4299                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4300                                     }
4301                                 }
4302                             } else {
4303                                 for (value = 0; value < 256; value++) {
4304                                     if (!isSPACE(value)) {
4305                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4306                                     }
4307                                 }
4308                             }
4309                         }
4310                     }
4311                     else {
4312                         if (data->start_class->flags & ANYOF_LOCALE) {
4313                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4314                         }
4315                         if (OP(scan) == SPACEU) {
4316                             for (value = 0; value < 256; value++) {
4317                                 if (isSPACE_L1(value)) {
4318                                     ANYOF_BITMAP_SET(data->start_class, value);
4319                                 }
4320                             }
4321                         } else {
4322                             for (value = 0; value < 256; value++) {
4323                                 if (isSPACE(value)) {
4324                                     ANYOF_BITMAP_SET(data->start_class, value);
4325                                 }
4326                             }
4327                         }
4328                     }
4329                     break;
4330                 case NSPACE:
4331                     if (flags & SCF_DO_STCLASS_AND) {
4332                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4333                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4334                             if (OP(scan) == NSPACEU) {
4335                                 for (value = 0; value < 256; value++) {
4336                                     if (isSPACE_L1(value)) {
4337                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4338                                     }
4339                                 }
4340                             } else {
4341                                 for (value = 0; value < 256; value++) {
4342                                     if (isSPACE(value)) {
4343                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4344                                     }
4345                                 }
4346                             }
4347                         }
4348                     }
4349                     else {
4350                         if (data->start_class->flags & ANYOF_LOCALE)
4351                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4352                         if (OP(scan) == NSPACEU) {
4353                             for (value = 0; value < 256; value++) {
4354                                 if (!isSPACE_L1(value)) {
4355                                     ANYOF_BITMAP_SET(data->start_class, value);
4356                                 }
4357                             }
4358                         }
4359                         else {
4360                             for (value = 0; value < 256; value++) {
4361                                 if (!isSPACE(value)) {
4362                                     ANYOF_BITMAP_SET(data->start_class, value);
4363                                 }
4364                             }
4365                         }
4366                     }
4367                     break;
4368                 case DIGIT:
4369                     if (flags & SCF_DO_STCLASS_AND) {
4370                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4371                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4372                             for (value = 0; value < 256; value++)
4373                                 if (!isDIGIT(value))
4374                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4375                         }
4376                     }
4377                     else {
4378                         if (data->start_class->flags & ANYOF_LOCALE)
4379                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4380                         for (value = 0; value < 256; value++)
4381                             if (isDIGIT(value))
4382                                 ANYOF_BITMAP_SET(data->start_class, value);
4383                     }
4384                     break;
4385                 case NDIGIT:
4386                     if (flags & SCF_DO_STCLASS_AND) {
4387                         if (!(data->start_class->flags & ANYOF_LOCALE))
4388                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4389                         for (value = 0; value < 256; value++)
4390                             if (isDIGIT(value))
4391                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4392                     }
4393                     else {
4394                         if (data->start_class->flags & ANYOF_LOCALE)
4395                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4396                         for (value = 0; value < 256; value++)
4397                             if (!isDIGIT(value))
4398                                 ANYOF_BITMAP_SET(data->start_class, value);
4399                     }
4400                     break;
4401                 CASE_SYNST_FNC(VERTWS);
4402                 CASE_SYNST_FNC(HORIZWS);
4403
4404                 }
4405                 if (flags & SCF_DO_STCLASS_OR)
4406                     cl_and(data->start_class, and_withp);
4407                 flags &= ~SCF_DO_STCLASS;
4408             }
4409         }
4410         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4411             data->flags |= (OP(scan) == MEOL
4412                             ? SF_BEFORE_MEOL
4413                             : SF_BEFORE_SEOL);
4414             SCAN_COMMIT(pRExC_state, data, minlenp);
4415
4416         }
4417         else if (  PL_regkind[OP(scan)] == BRANCHJ
4418                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4419                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4420                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4421             if ( OP(scan) == UNLESSM &&
4422                  scan->flags == 0 &&
4423                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4424                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4425             ) {
4426                 regnode *opt;
4427                 regnode *upto= regnext(scan);
4428                 DEBUG_PARSE_r({
4429                     SV * const mysv_val=sv_newmortal();
4430                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4431
4432                     /*DEBUG_PARSE_MSG("opfail");*/
4433                     regprop(RExC_rx, mysv_val, upto);
4434                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4435                                   SvPV_nolen_const(mysv_val),
4436                                   (IV)REG_NODE_NUM(upto),
4437                                   (IV)(upto - scan)
4438                     );
4439                 });
4440                 OP(scan) = OPFAIL;
4441                 NEXT_OFF(scan) = upto - scan;
4442                 for (opt= scan + 1; opt < upto ; opt++)
4443                     OP(opt) = OPTIMIZED;
4444                 scan= upto;
4445                 continue;
4446             }
4447             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4448                 || OP(scan) == UNLESSM )
4449             {
4450                 /* Negative Lookahead/lookbehind
4451                    In this case we can't do fixed string optimisation.
4452                 */
4453
4454                 I32 deltanext, minnext, fake = 0;
4455                 regnode *nscan;
4456                 struct regnode_charclass_class intrnl;
4457                 int f = 0;
4458
4459                 data_fake.flags = 0;
4460                 if (data) {
4461                     data_fake.whilem_c = data->whilem_c;
4462                     data_fake.last_closep = data->last_closep;
4463                 }
4464                 else
4465                     data_fake.last_closep = &fake;
4466                 data_fake.pos_delta = delta;
4467                 if ( flags & SCF_DO_STCLASS && !scan->flags
4468                      && OP(scan) == IFMATCH ) { /* Lookahead */
4469                     cl_init(pRExC_state, &intrnl);
4470                     data_fake.start_class = &intrnl;
4471                     f |= SCF_DO_STCLASS_AND;
4472                 }
4473                 if (flags & SCF_WHILEM_VISITED_POS)
4474                     f |= SCF_WHILEM_VISITED_POS;
4475                 next = regnext(scan);
4476                 nscan = NEXTOPER(NEXTOPER(scan));
4477                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4478                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4479                 if (scan->flags) {
4480                     if (deltanext) {
4481                         FAIL("Variable length lookbehind not implemented");
4482                     }
4483                     else if (minnext > (I32)U8_MAX) {
4484                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4485                     }
4486                     scan->flags = (U8)minnext;
4487                 }
4488                 if (data) {
4489                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4490                         pars++;
4491                     if (data_fake.flags & SF_HAS_EVAL)
4492                         data->flags |= SF_HAS_EVAL;
4493                     data->whilem_c = data_fake.whilem_c;
4494                 }
4495                 if (f & SCF_DO_STCLASS_AND) {
4496                     if (flags & SCF_DO_STCLASS_OR) {
4497                         /* OR before, AND after: ideally we would recurse with
4498                          * data_fake to get the AND applied by study of the
4499                          * remainder of the pattern, and then derecurse;
4500                          * *** HACK *** for now just treat as "no information".
4501                          * See [perl #56690].
4502                          */
4503                         cl_init(pRExC_state, data->start_class);
4504                     }  else {
4505                         /* AND before and after: combine and continue */
4506                         const int was = (data->start_class->flags & ANYOF_EOS);
4507
4508                         cl_and(data->start_class, &intrnl);
4509                         if (was)
4510                             data->start_class->flags |= ANYOF_EOS;
4511                     }
4512                 }
4513             }
4514 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4515             else {
4516                 /* Positive Lookahead/lookbehind
4517                    In this case we can do fixed string optimisation,
4518                    but we must be careful about it. Note in the case of
4519                    lookbehind the positions will be offset by the minimum
4520                    length of the pattern, something we won't know about
4521                    until after the recurse.
4522                 */
4523                 I32 deltanext, fake = 0;
4524                 regnode *nscan;
4525                 struct regnode_charclass_class intrnl;
4526                 int f = 0;
4527                 /* We use SAVEFREEPV so that when the full compile
4528                     is finished perl will clean up the allocated
4529                     minlens when it's all done. This way we don't
4530                     have to worry about freeing them when we know
4531                     they wont be used, which would be a pain.
4532                  */
4533                 I32 *minnextp;
4534                 Newx( minnextp, 1, I32 );
4535                 SAVEFREEPV(minnextp);
4536
4537                 if (data) {
4538                     StructCopy(data, &data_fake, scan_data_t);
4539                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4540                         f |= SCF_DO_SUBSTR;
4541                         if (scan->flags)
4542                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4543                         data_fake.last_found=newSVsv(data->last_found);
4544                     }
4545                 }
4546                 else
4547                     data_fake.last_closep = &fake;
4548                 data_fake.flags = 0;
4549                 data_fake.pos_delta = delta;
4550                 if (is_inf)
4551                     data_fake.flags |= SF_IS_INF;
4552                 if ( flags & SCF_DO_STCLASS && !scan->flags
4553                      && OP(scan) == IFMATCH ) { /* Lookahead */
4554                     cl_init(pRExC_state, &intrnl);
4555                     data_fake.start_class = &intrnl;
4556                     f |= SCF_DO_STCLASS_AND;
4557                 }
4558                 if (flags & SCF_WHILEM_VISITED_POS)
4559                     f |= SCF_WHILEM_VISITED_POS;
4560                 next = regnext(scan);
4561                 nscan = NEXTOPER(NEXTOPER(scan));
4562
4563                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4564                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4565                 if (scan->flags) {
4566                     if (deltanext) {
4567                         FAIL("Variable length lookbehind not implemented");
4568                     }
4569                     else if (*minnextp > (I32)U8_MAX) {
4570                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4571                     }
4572                     scan->flags = (U8)*minnextp;
4573                 }
4574
4575                 *minnextp += min;
4576
4577                 if (f & SCF_DO_STCLASS_AND) {
4578                     const int was = (data->start_class->flags & ANYOF_EOS);
4579
4580                     cl_and(data->start_class, &intrnl);
4581                     if (was)
4582                         data->start_class->flags |= ANYOF_EOS;
4583                 }
4584                 if (data) {
4585                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4586                         pars++;
4587                     if (data_fake.flags & SF_HAS_EVAL)
4588                         data->flags |= SF_HAS_EVAL;
4589                     data->whilem_c = data_fake.whilem_c;
4590                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4591                         if (RExC_rx->minlen<*minnextp)
4592                             RExC_rx->minlen=*minnextp;
4593                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4594                         SvREFCNT_dec(data_fake.last_found);
4595
4596                         if ( data_fake.minlen_fixed != minlenp )
4597                         {
4598                             data->offset_fixed= data_fake.offset_fixed;
4599                             data->minlen_fixed= data_fake.minlen_fixed;
4600                             data->lookbehind_fixed+= scan->flags;
4601                         }
4602                         if ( data_fake.minlen_float != minlenp )
4603                         {
4604                             data->minlen_float= data_fake.minlen_float;
4605                             data->offset_float_min=data_fake.offset_float_min;
4606                             data->offset_float_max=data_fake.offset_float_max;
4607                             data->lookbehind_float+= scan->flags;
4608                         }
4609                     }
4610                 }
4611             }
4612 #endif
4613         }
4614         else if (OP(scan) == OPEN) {
4615             if (stopparen != (I32)ARG(scan))
4616                 pars++;
4617         }
4618         else if (OP(scan) == CLOSE) {
4619             if (stopparen == (I32)ARG(scan)) {
4620                 break;
4621             }
4622             if ((I32)ARG(scan) == is_par) {
4623                 next = regnext(scan);
4624
4625                 if ( next && (OP(next) != WHILEM) && next < last)
4626                     is_par = 0;         /* Disable optimization */
4627             }
4628             if (data)
4629                 *(data->last_closep) = ARG(scan);
4630         }
4631         else if (OP(scan) == EVAL) {
4632                 if (data)
4633                     data->flags |= SF_HAS_EVAL;
4634         }
4635         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4636             if (flags & SCF_DO_SUBSTR) {
4637                 SCAN_COMMIT(pRExC_state,data,minlenp);
4638                 flags &= ~SCF_DO_SUBSTR;
4639             }
4640             if (data && OP(scan)==ACCEPT) {
4641                 data->flags |= SCF_SEEN_ACCEPT;
4642                 if (stopmin > min)
4643                     stopmin = min;
4644             }
4645         }
4646         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4647         {
4648                 if (flags & SCF_DO_SUBSTR) {
4649                     SCAN_COMMIT(pRExC_state,data,minlenp);
4650                     data->longest = &(data->longest_float);
4651                 }
4652                 is_inf = is_inf_internal = 1;
4653                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4654                     cl_anything(pRExC_state, data->start_class);
4655                 flags &= ~SCF_DO_STCLASS;
4656         }
4657         else if (OP(scan) == GPOS) {
4658             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4659                 !(delta || is_inf || (data && data->pos_delta)))
4660             {
4661                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4662                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4663                 if (RExC_rx->gofs < (U32)min)
4664                     RExC_rx->gofs = min;
4665             } else {
4666                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4667                 RExC_rx->gofs = 0;
4668             }
4669         }
4670 #ifdef TRIE_STUDY_OPT
4671 #ifdef FULL_TRIE_STUDY
4672         else if (PL_regkind[OP(scan)] == TRIE) {
4673             /* NOTE - There is similar code to this block above for handling
4674                BRANCH nodes on the initial study.  If you change stuff here
4675                check there too. */
4676             regnode *trie_node= scan;
4677             regnode *tail= regnext(scan);
4678             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4679             I32 max1 = 0, min1 = I32_MAX;
4680             struct regnode_charclass_class accum;
4681
4682             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4683                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4684             if (flags & SCF_DO_STCLASS)
4685                 cl_init_zero(pRExC_state, &accum);
4686
4687             if (!trie->jump) {
4688                 min1= trie->minlen;
4689                 max1= trie->maxlen;
4690             } else {
4691                 const regnode *nextbranch= NULL;
4692                 U32 word;
4693
4694                 for ( word=1 ; word <= trie->wordcount ; word++)
4695                 {
4696                     I32 deltanext=0, minnext=0, f = 0, fake;
4697                     struct regnode_charclass_class this_class;
4698
4699                     data_fake.flags = 0;
4700                     if (data) {
4701                         data_fake.whilem_c = data->whilem_c;
4702                         data_fake.last_closep = data->last_closep;
4703                     }
4704                     else
4705                         data_fake.last_closep = &fake;
4706                     data_fake.pos_delta = delta;
4707                     if (flags & SCF_DO_STCLASS) {
4708                         cl_init(pRExC_state, &this_class);
4709                         data_fake.start_class = &this_class;
4710                         f = SCF_DO_STCLASS_AND;
4711                     }
4712                     if (flags & SCF_WHILEM_VISITED_POS)
4713                         f |= SCF_WHILEM_VISITED_POS;
4714
4715                     if (trie->jump[word]) {
4716                         if (!nextbranch)
4717                             nextbranch = trie_node + trie->jump[0];
4718                         scan= trie_node + trie->jump[word];
4719                         /* We go from the jump point to the branch that follows
4720                            it. Note this means we need the vestigal unused branches
4721                            even though they arent otherwise used.
4722                          */
4723                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4724                             &deltanext, (regnode *)nextbranch, &data_fake,
4725                             stopparen, recursed, NULL, f,depth+1);
4726                     }
4727                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4728                         nextbranch= regnext((regnode*)nextbranch);
4729
4730                     if (min1 > (I32)(minnext + trie->minlen))
4731                         min1 = minnext + trie->minlen;
4732                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4733                         max1 = minnext + deltanext + trie->maxlen;
4734                     if (deltanext == I32_MAX)
4735                         is_inf = is_inf_internal = 1;
4736
4737                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4738                         pars++;
4739                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4740                         if ( stopmin > min + min1)
4741                             stopmin = min + min1;
4742                         flags &= ~SCF_DO_SUBSTR;
4743                         if (data)
4744                             data->flags |= SCF_SEEN_ACCEPT;
4745                     }
4746                     if (data) {
4747                         if (data_fake.flags & SF_HAS_EVAL)
4748                             data->flags |= SF_HAS_EVAL;
4749                         data->whilem_c = data_fake.whilem_c;
4750                     }
4751                     if (flags & SCF_DO_STCLASS)
4752                         cl_or(pRExC_state, &accum, &this_class);
4753                 }
4754             }
4755             if (flags & SCF_DO_SUBSTR) {
4756                 data->pos_min += min1;
4757                 data->pos_delta += max1 - min1;
4758                 if (max1 != min1 || is_inf)
4759                     data->longest = &(data->longest_float);
4760             }
4761             min += min1;
4762             delta += max1 - min1;
4763             if (flags & SCF_DO_STCLASS_OR) {
4764                 cl_or(pRExC_state, data->start_class, &accum);
4765                 if (min1) {
4766                     cl_and(data->start_class, and_withp);
4767                     flags &= ~SCF_DO_STCLASS;
4768                 }
4769             }
4770             else if (flags & SCF_DO_STCLASS_AND) {
4771                 if (min1) {
4772                     cl_and(data->start_class, &accum);
4773                     flags &= ~SCF_DO_STCLASS;
4774                 }
4775                 else {
4776                     /* Switch to OR mode: cache the old value of
4777                      * data->start_class */
4778                     INIT_AND_WITHP;
4779                     StructCopy(data->start_class, and_withp,
4780                                struct regnode_charclass_class);
4781                     flags &= ~SCF_DO_STCLASS_AND;
4782                     StructCopy(&accum, data->start_class,
4783                                struct regnode_charclass_class);
4784                     flags |= SCF_DO_STCLASS_OR;
4785                     data->start_class->flags |= ANYOF_EOS;
4786                 }
4787             }
4788             scan= tail;
4789             continue;
4790         }
4791 #else
4792         else if (PL_regkind[OP(scan)] == TRIE) {
4793             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4794             U8*bang=NULL;
4795
4796             min += trie->minlen;
4797             delta += (trie->maxlen - trie->minlen);
4798             flags &= ~SCF_DO_STCLASS; /* xxx */
4799             if (flags & SCF_DO_SUBSTR) {
4800                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4801                 data->pos_min += trie->minlen;
4802                 data->pos_delta += (trie->maxlen - trie->minlen);
4803                 if (trie->maxlen != trie->minlen)
4804                     data->longest = &(data->longest_float);
4805             }
4806             if (trie->jump) /* no more substrings -- for now /grr*/
4807                 flags &= ~SCF_DO_SUBSTR;
4808         }
4809 #endif /* old or new */
4810 #endif /* TRIE_STUDY_OPT */
4811
4812         /* Else: zero-length, ignore. */
4813         scan = regnext(scan);
4814     }
4815     if (frame) {
4816         last = frame->last;
4817         scan = frame->next;
4818         stopparen = frame->stop;
4819         frame = frame->prev;
4820         goto fake_study_recurse;
4821     }
4822
4823   finish:
4824     assert(!frame);
4825     DEBUG_STUDYDATA("pre-fin:",data,depth);
4826
4827     *scanp = scan;
4828     *deltap = is_inf_internal ? I32_MAX : delta;
4829     if (flags & SCF_DO_SUBSTR && is_inf)
4830         data->pos_delta = I32_MAX - data->pos_min;
4831     if (is_par > (I32)U8_MAX)
4832         is_par = 0;
4833     if (is_par && pars==1 && data) {
4834         data->flags |= SF_IN_PAR;
4835         data->flags &= ~SF_HAS_PAR;
4836     }
4837     else if (pars && data) {
4838         data->flags |= SF_HAS_PAR;
4839         data->flags &= ~SF_IN_PAR;
4840     }
4841     if (flags & SCF_DO_STCLASS_OR)
4842         cl_and(data->start_class, and_withp);
4843     if (flags & SCF_TRIE_RESTUDY)
4844         data->flags |=  SCF_TRIE_RESTUDY;
4845
4846     DEBUG_STUDYDATA("post-fin:",data,depth);
4847
4848     return min < stopmin ? min : stopmin;
4849 }
4850
4851 STATIC U32
4852 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4853 {
4854     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4855
4856     PERL_ARGS_ASSERT_ADD_DATA;
4857
4858     Renewc(RExC_rxi->data,
4859            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4860            char, struct reg_data);
4861     if(count)
4862         Renew(RExC_rxi->data->what, count + n, U8);
4863     else
4864         Newx(RExC_rxi->data->what, n, U8);
4865     RExC_rxi->data->count = count + n;
4866     Copy(s, RExC_rxi->data->what + count, n, U8);
4867     return count;
4868 }
4869
4870 /*XXX: todo make this not included in a non debugging perl */
4871 #ifndef PERL_IN_XSUB_RE
4872 void
4873 Perl_reginitcolors(pTHX)
4874 {
4875     dVAR;
4876     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4877     if (s) {
4878         char *t = savepv(s);
4879         int i = 0;
4880         PL_colors[0] = t;
4881         while (++i < 6) {
4882             t = strchr(t, '\t');
4883             if (t) {
4884                 *t = '\0';
4885                 PL_colors[i] = ++t;
4886             }
4887             else
4888                 PL_colors[i] = t = (char *)"";
4889         }
4890     } else {
4891         int i = 0;
4892         while (i < 6)
4893             PL_colors[i++] = (char *)"";
4894     }
4895     PL_colorset = 1;
4896 }
4897 #endif
4898
4899
4900 #ifdef TRIE_STUDY_OPT
4901 #define CHECK_RESTUDY_GOTO                                  \
4902         if (                                                \
4903               (data.flags & SCF_TRIE_RESTUDY)               \
4904               && ! restudied++                              \
4905         )     goto reStudy
4906 #else
4907 #define CHECK_RESTUDY_GOTO
4908 #endif
4909
4910 /*
4911  * pregcomp - compile a regular expression into internal code
4912  *
4913  * Decides which engine's compiler to call based on the hint currently in
4914  * scope
4915  */
4916
4917 #ifndef PERL_IN_XSUB_RE
4918
4919 /* return the currently in-scope regex engine (or the default if none)  */
4920
4921 regexp_engine const *
4922 Perl_current_re_engine(pTHX)
4923 {
4924     dVAR;
4925
4926     if (IN_PERL_COMPILETIME) {
4927         HV * const table = GvHV(PL_hintgv);
4928         SV **ptr;
4929
4930         if (!table)
4931             return &PL_core_reg_engine;
4932         ptr = hv_fetchs(table, "regcomp", FALSE);
4933         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4934             return &PL_core_reg_engine;
4935         return INT2PTR(regexp_engine*,SvIV(*ptr));
4936     }
4937     else {
4938         SV *ptr;
4939         if (!PL_curcop->cop_hints_hash)
4940             return &PL_core_reg_engine;
4941         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4942         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4943             return &PL_core_reg_engine;
4944         return INT2PTR(regexp_engine*,SvIV(ptr));
4945     }
4946 }
4947
4948
4949 REGEXP *
4950 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4951 {
4952     dVAR;
4953     regexp_engine const *eng = current_re_engine();
4954     GET_RE_DEBUG_FLAGS_DECL;
4955
4956     PERL_ARGS_ASSERT_PREGCOMP;
4957
4958     /* Dispatch a request to compile a regexp to correct regexp engine. */
4959     DEBUG_COMPILE_r({
4960         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4961                         PTR2UV(eng));
4962     });
4963     return CALLREGCOMP_ENG(eng, pattern, flags);
4964 }
4965 #endif
4966
4967 /* public(ish) entry point for the perl core's own regex compiling code.
4968  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4969  * pattern rather than a list of OPs, and uses the internal engine rather
4970  * than the current one */
4971
4972 REGEXP *
4973 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4974 {
4975     SV *pat = pattern; /* defeat constness! */
4976     PERL_ARGS_ASSERT_RE_COMPILE;
4977     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4978 #ifdef PERL_IN_XSUB_RE
4979                                 &my_reg_engine,
4980 #else
4981                                 &PL_core_reg_engine,
4982 #endif
4983                                 NULL, NULL, rx_flags, 0);
4984 }
4985
4986 /* see if there are any run-time code blocks in the pattern.
4987  * False positives are allowed */
4988
4989 static bool
4990 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4991                     U32 pm_flags, char *pat, STRLEN plen)
4992 {
4993     int n = 0;
4994     STRLEN s;
4995
4996     /* avoid infinitely recursing when we recompile the pattern parcelled up
4997      * as qr'...'. A single constant qr// string can't have have any
4998      * run-time component in it, and thus, no runtime code. (A non-qr
4999      * string, however, can, e.g. $x =~ '(?{})') */
5000     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
5001         return 0;
5002
5003     for (s = 0; s < plen; s++) {
5004         if (n < pRExC_state->num_code_blocks
5005             && s == pRExC_state->code_blocks[n].start)
5006         {
5007             s = pRExC_state->code_blocks[n].end;
5008             n++;
5009             continue;
5010         }
5011         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5012          * positives here */
5013         if (pat[s] == '(' && pat[s+1] == '?' &&
5014             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5015         )
5016             return 1;
5017     }
5018     return 0;
5019 }
5020
5021 /* Handle run-time code blocks. We will already have compiled any direct
5022  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5023  * copy of it, but with any literal code blocks blanked out and
5024  * appropriate chars escaped; then feed it into
5025  *
5026  *    eval "qr'modified_pattern'"
5027  *
5028  * For example,
5029  *
5030  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5031  *
5032  * becomes
5033  *
5034  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5035  *
5036  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5037  * and merge them with any code blocks of the original regexp.
5038  *
5039  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5040  * instead, just save the qr and return FALSE; this tells our caller that
5041  * the original pattern needs upgrading to utf8.
5042  */
5043
5044 static bool
5045 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5046     char *pat, STRLEN plen)
5047 {
5048     SV *qr;
5049
5050     GET_RE_DEBUG_FLAGS_DECL;
5051
5052     if (pRExC_state->runtime_code_qr) {
5053         /* this is the second time we've been called; this should
5054          * only happen if the main pattern got upgraded to utf8
5055          * during compilation; re-use the qr we compiled first time
5056          * round (which should be utf8 too)
5057          */
5058         qr = pRExC_state->runtime_code_qr;
5059         pRExC_state->runtime_code_qr = NULL;
5060         assert(RExC_utf8 && SvUTF8(qr));
5061     }
5062     else {
5063         int n = 0;
5064         STRLEN s;
5065         char *p, *newpat;
5066         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5067         SV *sv, *qr_ref;
5068         dSP;
5069
5070         /* determine how many extra chars we need for ' and \ escaping */
5071         for (s = 0; s < plen; s++) {
5072             if (pat[s] == '\'' || pat[s] == '\\')
5073                 newlen++;
5074         }
5075
5076         Newx(newpat, newlen, char);
5077         p = newpat;
5078         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5079
5080         for (s = 0; s < plen; s++) {
5081             if (n < pRExC_state->num_code_blocks
5082                 && s == pRExC_state->code_blocks[n].start)
5083             {
5084                 /* blank out literal code block */
5085                 assert(pat[s] == '(');
5086                 while (s <= pRExC_state->code_blocks[n].end) {
5087                     *p++ = ' ';
5088                     s++;
5089                 }
5090                 s--;
5091                 n++;
5092                 continue;
5093             }
5094             if (pat[s] == '\'' || pat[s] == '\\')
5095                 *p++ = '\\';
5096             *p++ = pat[s];
5097         }
5098         *p++ = '\'';
5099         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5100             *p++ = 'x';
5101         *p++ = '\0';
5102         DEBUG_COMPILE_r({
5103             PerlIO_printf(Perl_debug_log,
5104                 "%sre-parsing pattern for runtime code:%s %s\n",
5105                 PL_colors[4],PL_colors[5],newpat);
5106         });
5107
5108         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5109         Safefree(newpat);
5110
5111         ENTER;
5112         SAVETMPS;
5113         save_re_context();
5114         PUSHSTACKi(PERLSI_REQUIRE);
5115         /* this causes the toker to collapse \\ into \ when parsing
5116          * qr''; normally only q'' does this. It also alters hints
5117          * handling */
5118         PL_reg_state.re_reparsing = TRUE;
5119         eval_sv(sv, G_SCALAR);
5120         SvREFCNT_dec(sv);
5121         SPAGAIN;
5122         qr_ref = POPs;
5123         PUTBACK;
5124         if (SvTRUE(ERRSV))
5125             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5126         assert(SvROK(qr_ref));
5127         qr = SvRV(qr_ref);
5128         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5129         /* the leaving below frees the tmp qr_ref.
5130          * Give qr a life of its own */
5131         SvREFCNT_inc(qr);
5132         POPSTACK;
5133         FREETMPS;
5134         LEAVE;
5135
5136     }
5137
5138     if (!RExC_utf8 && SvUTF8(qr)) {
5139         /* first time through; the pattern got upgraded; save the
5140          * qr for the next time through */
5141         assert(!pRExC_state->runtime_code_qr);
5142         pRExC_state->runtime_code_qr = qr;
5143         return 0;
5144     }
5145
5146
5147     /* extract any code blocks within the returned qr//  */
5148
5149
5150     /* merge the main (r1) and run-time (r2) code blocks into one */
5151     {
5152         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5153         struct reg_code_block *new_block, *dst;
5154         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5155         int i1 = 0, i2 = 0;
5156
5157         if (!r2->num_code_blocks) /* we guessed wrong */
5158             return 1;
5159
5160         Newx(new_block,
5161             r1->num_code_blocks + r2->num_code_blocks,
5162             struct reg_code_block);
5163         dst = new_block;
5164
5165         while (    i1 < r1->num_code_blocks
5166                 || i2 < r2->num_code_blocks)
5167         {
5168             struct reg_code_block *src;
5169             bool is_qr = 0;
5170
5171             if (i1 == r1->num_code_blocks) {
5172                 src = &r2->code_blocks[i2++];
5173                 is_qr = 1;
5174             }
5175             else if (i2 == r2->num_code_blocks)
5176                 src = &r1->code_blocks[i1++];
5177             else if (  r1->code_blocks[i1].start
5178                      < r2->code_blocks[i2].start)
5179             {
5180                 src = &r1->code_blocks[i1++];
5181                 assert(src->end < r2->code_blocks[i2].start);
5182             }
5183             else {
5184                 assert(  r1->code_blocks[i1].start
5185                        > r2->code_blocks[i2].start);
5186                 src = &r2->code_blocks[i2++];
5187                 is_qr = 1;
5188                 assert(src->end < r1->code_blocks[i1].start);
5189             }
5190
5191             assert(pat[src->start] == '(');
5192             assert(pat[src->end]   == ')');
5193             dst->start      = src->start;
5194             dst->end        = src->end;
5195             dst->block      = src->block;
5196             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5197                                     : src->src_regex;
5198             dst++;
5199         }
5200         r1->num_code_blocks += r2->num_code_blocks;
5201         Safefree(r1->code_blocks);
5202         r1->code_blocks = new_block;
5203     }
5204
5205     SvREFCNT_dec(qr);
5206     return 1;
5207 }
5208
5209
5210 STATIC bool
5211 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5212 {
5213     /* This is the common code for setting up the floating and fixed length
5214      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5215      * as to whether succeeded or not */
5216
5217     I32 t,ml;
5218
5219     if (! (longest_length
5220            || (eol /* Can't have SEOL and MULTI */
5221                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5222           )
5223             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5224         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5225     {
5226         return FALSE;
5227     }
5228
5229     /* copy the information about the longest from the reg_scan_data
5230         over to the program. */
5231     if (SvUTF8(sv_longest)) {
5232         *rx_utf8 = sv_longest;
5233         *rx_substr = NULL;
5234     } else {
5235         *rx_substr = sv_longest;
5236         *rx_utf8 = NULL;
5237     }
5238     /* end_shift is how many chars that must be matched that
5239         follow this item. We calculate it ahead of time as once the
5240         lookbehind offset is added in we lose the ability to correctly
5241         calculate it.*/
5242     ml = minlen ? *(minlen) : (I32)longest_length;
5243     *rx_end_shift = ml - offset
5244         - longest_length + (SvTAIL(sv_longest) != 0)
5245         + lookbehind;
5246
5247     t = (eol/* Can't have SEOL and MULTI */
5248          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5249     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5250
5251     return TRUE;
5252 }
5253
5254 /*
5255  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5256  * regular expression into internal code.
5257  * The pattern may be passed either as:
5258  *    a list of SVs (patternp plus pat_count)
5259  *    a list of OPs (expr)
5260  * If both are passed, the SV list is used, but the OP list indicates
5261  * which SVs are actually pre-compiled code blocks
5262  *
5263  * The SVs in the list have magic and qr overloading applied to them (and
5264  * the list may be modified in-place with replacement SVs in the latter
5265  * case).
5266  *
5267  * If the pattern hasn't changed from old_re, then old_re will be
5268  * returned.
5269  *
5270  * eng is the current engine. If that engine has an op_comp method, then
5271  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5272  * do the initial concatenation of arguments and pass on to the external
5273  * engine.
5274  *
5275  * If is_bare_re is not null, set it to a boolean indicating whether the
5276  * arg list reduced (after overloading) to a single bare regex which has
5277  * been returned (i.e. /$qr/).
5278  *
5279  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5280  *
5281  * pm_flags contains the PMf_* flags, typically based on those from the
5282  * pm_flags field of the related PMOP. Currently we're only interested in
5283  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5284  *
5285  * We can't allocate space until we know how big the compiled form will be,
5286  * but we can't compile it (and thus know how big it is) until we've got a
5287  * place to put the code.  So we cheat:  we compile it twice, once with code
5288  * generation turned off and size counting turned on, and once "for real".
5289  * This also means that we don't allocate space until we are sure that the
5290  * thing really will compile successfully, and we never have to move the
5291  * code and thus invalidate pointers into it.  (Note that it has to be in
5292  * one piece because free() must be able to free it all.) [NB: not true in perl]
5293  *
5294  * Beware that the optimization-preparation code in here knows about some
5295  * of the structure of the compiled regexp.  [I'll say.]
5296  */
5297
5298 REGEXP *
5299 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5300                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5301                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5302 {
5303     dVAR;
5304     REGEXP *rx;
5305     struct regexp *r;
5306     regexp_internal *ri;
5307     STRLEN plen;
5308     char  * VOL exp;
5309     char* xend;
5310     regnode *scan;
5311     I32 flags;
5312     I32 minlen = 0;
5313     U32 rx_flags;
5314     SV * VOL pat;
5315
5316     /* these are all flags - maybe they should be turned
5317      * into a single int with different bit masks */
5318     I32 sawlookahead = 0;
5319     I32 sawplus = 0;
5320     I32 sawopen = 0;
5321     bool used_setjump = FALSE;
5322     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5323     bool code_is_utf8 = 0;
5324     bool VOL recompile = 0;
5325     bool runtime_code = 0;
5326     U8 jump_ret = 0;
5327     dJMPENV;
5328     scan_data_t data;
5329     RExC_state_t RExC_state;
5330     RExC_state_t * const pRExC_state = &RExC_state;
5331 #ifdef TRIE_STUDY_OPT
5332     int restudied;
5333     RExC_state_t copyRExC_state;
5334 #endif
5335     GET_RE_DEBUG_FLAGS_DECL;
5336
5337     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5338
5339     DEBUG_r(if (!PL_colorset) reginitcolors());
5340
5341 #ifndef PERL_IN_XSUB_RE
5342     /* Initialize these here instead of as-needed, as is quick and avoids
5343      * having to test them each time otherwise */
5344     if (! PL_AboveLatin1) {
5345         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5346         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5347         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5348
5349         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5350         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5351
5352         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5353         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5354
5355         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5356         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5357
5358         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5359
5360         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5361         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5362
5363         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5364
5365         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5366         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5367
5368         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5369         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5370
5371         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5372         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5373
5374         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5375         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5376
5377         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5378         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5379
5380         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5381         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5382
5383         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5384         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5385
5386         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5387
5388         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5389         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5390
5391         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5392         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5393     }
5394 #endif
5395
5396     pRExC_state->code_blocks = NULL;
5397     pRExC_state->num_code_blocks = 0;
5398
5399     if (is_bare_re)
5400         *is_bare_re = FALSE;
5401
5402     if (expr && (expr->op_type == OP_LIST ||
5403                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5404
5405         /* is the source UTF8, and how many code blocks are there? */
5406         OP *o;
5407         int ncode = 0;
5408
5409         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5410             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5411                 code_is_utf8 = 1;
5412             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5413                 /* count of DO blocks */
5414                 ncode++;
5415         }
5416         if (ncode) {
5417             pRExC_state->num_code_blocks = ncode;
5418             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5419         }
5420     }
5421
5422     if (pat_count) {
5423         /* handle a list of SVs */
5424
5425         SV **svp;
5426
5427         /* apply magic and RE overloading to each arg */
5428         for (svp = patternp; svp < patternp + pat_count; svp++) {
5429             SV *rx = *svp;
5430             SvGETMAGIC(rx);
5431             if (SvROK(rx) && SvAMAGIC(rx)) {
5432                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5433                 if (sv) {
5434                     if (SvROK(sv))
5435                         sv = SvRV(sv);
5436                     if (SvTYPE(sv) != SVt_REGEXP)
5437                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5438                     *svp = sv;
5439                 }
5440             }
5441         }
5442
5443         if (pat_count > 1) {
5444             /* concat multiple args and find any code block indexes */
5445
5446             OP *o = NULL;
5447             int n = 0;
5448             bool utf8 = 0;
5449             STRLEN orig_patlen = 0;
5450
5451             if (pRExC_state->num_code_blocks) {
5452                 o = cLISTOPx(expr)->op_first;
5453                 assert(o->op_type == OP_PUSHMARK);
5454                 o = o->op_sibling;
5455             }
5456
5457             pat = newSVpvn("", 0);
5458             SAVEFREESV(pat);
5459
5460             /* determine if the pattern is going to be utf8 (needed
5461              * in advance to align code block indices correctly).
5462              * XXX This could fail to be detected for an arg with
5463              * overloading but not concat overloading; but the main effect
5464              * in this obscure case is to need a 'use re eval' for a
5465              * literal code block */
5466             for (svp = patternp; svp < patternp + pat_count; svp++) {
5467                 if (SvUTF8(*svp))
5468                     utf8 = 1;
5469             }
5470             if (utf8)
5471                 SvUTF8_on(pat);
5472
5473             for (svp = patternp; svp < patternp + pat_count; svp++) {
5474                 SV *sv, *msv = *svp;
5475                 SV *rx;
5476                 bool code = 0;
5477                 if (o) {
5478                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5479                         assert(n < pRExC_state->num_code_blocks);
5480                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5481                         pRExC_state->code_blocks[n].block = o;
5482                         pRExC_state->code_blocks[n].src_regex = NULL;
5483                         n++;
5484                         code = 1;
5485                         o = o->op_sibling; /* skip CONST */
5486                         assert(o);
5487                     }
5488                     o = o->op_sibling;;
5489                 }
5490
5491                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5492                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5493                 {
5494                     sv_setsv(pat, sv);
5495                     /* overloading involved: all bets are off over literal
5496                      * code. Pretend we haven't seen it */
5497                     pRExC_state->num_code_blocks -= n;
5498                     n = 0;
5499                     rx = NULL;
5500
5501                 }
5502                 else  {
5503                     while (SvAMAGIC(msv)
5504                             && (sv = AMG_CALLunary(msv, string_amg))
5505                             && sv != msv)
5506                     {
5507                         msv = sv;
5508                         SvGETMAGIC(msv);
5509                     }
5510                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5511                         msv = SvRV(msv);
5512                     orig_patlen = SvCUR(pat);
5513                     sv_catsv_nomg(pat, msv);
5514                     rx = msv;
5515                     if (code)
5516                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5517                 }
5518
5519                 /* extract any code blocks within any embedded qr//'s */
5520                 if (rx && SvTYPE(rx) == SVt_REGEXP
5521                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5522                 {
5523
5524                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5525                     if (ri->num_code_blocks) {
5526                         int i;
5527                         /* the presence of an embedded qr// with code means
5528                          * we should always recompile: the text of the
5529                          * qr// may not have changed, but it may be a
5530                          * different closure than last time */
5531                         recompile = 1;
5532                         Renew(pRExC_state->code_blocks,
5533                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5534                             struct reg_code_block);
5535                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5536                         for (i=0; i < ri->num_code_blocks; i++) {
5537                             struct reg_code_block *src, *dst;
5538                             STRLEN offset =  orig_patlen
5539                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5540                             assert(n < pRExC_state->num_code_blocks);
5541                             src = &ri->code_blocks[i];
5542                             dst = &pRExC_state->code_blocks[n];
5543                             dst->start      = src->start + offset;
5544                             dst->end        = src->end   + offset;
5545                             dst->block      = src->block;
5546                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5547                                                     src->src_regex
5548                                                         ? src->src_regex
5549                                                         : (REGEXP*)rx);
5550                             n++;
5551                         }
5552                     }
5553                 }
5554             }
5555             SvSETMAGIC(pat);
5556         }
5557         else {
5558             SV *sv;
5559             pat = *patternp;
5560             while (SvAMAGIC(pat)
5561                     && (sv = AMG_CALLunary(pat, string_amg))
5562                     && sv != pat)
5563             {
5564                 pat = sv;
5565                 SvGETMAGIC(pat);
5566             }
5567         }
5568
5569         /* handle bare regex: foo =~ $re */
5570         {
5571             SV *re = pat;
5572             if (SvROK(re))
5573                 re = SvRV(re);
5574             if (SvTYPE(re) == SVt_REGEXP) {
5575                 if (is_bare_re)
5576                     *is_bare_re = TRUE;
5577                 SvREFCNT_inc(re);
5578                 Safefree(pRExC_state->code_blocks);
5579                 return (REGEXP*)re;
5580             }
5581         }
5582     }
5583     else {
5584         /* not a list of SVs, so must be a list of OPs */
5585         assert(expr);
5586         if (expr->op_type == OP_LIST) {
5587             int i = -1;
5588             bool is_code = 0;
5589             OP *o;
5590
5591             pat = newSVpvn("", 0);
5592             SAVEFREESV(pat);
5593             if (code_is_utf8)
5594                 SvUTF8_on(pat);
5595
5596             /* given a list of CONSTs and DO blocks in expr, append all
5597              * the CONSTs to pat, and record the start and end of each
5598              * code block in code_blocks[] (each DO{} op is followed by an
5599              * OP_CONST containing the corresponding literal '(?{...})
5600              * text)
5601              */
5602             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5603                 if (o->op_type == OP_CONST) {
5604                     sv_catsv(pat, cSVOPo_sv);
5605                     if (is_code) {
5606                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5607                         is_code = 0;
5608                     }
5609                 }
5610                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5611                     assert(i+1 < pRExC_state->num_code_blocks);
5612                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5613                     pRExC_state->code_blocks[i].block = o;
5614                     pRExC_state->code_blocks[i].src_regex = NULL;
5615                     is_code = 1;
5616                 }
5617             }
5618         }
5619         else {
5620             assert(expr->op_type == OP_CONST);
5621             pat = cSVOPx_sv(expr);
5622         }
5623     }
5624
5625     exp = SvPV_nomg(pat, plen);
5626
5627     if (!eng->op_comp) {
5628         if ((SvUTF8(pat) && IN_BYTES)
5629                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5630         {
5631             /* make a temporary copy; either to convert to bytes,
5632              * or to avoid repeating get-magic / overloaded stringify */
5633             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5634                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5635         }
5636         Safefree(pRExC_state->code_blocks);
5637         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5638     }
5639
5640     /* ignore the utf8ness if the pattern is 0 length */
5641     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5642     RExC_uni_semantics = 0;
5643     RExC_contains_locale = 0;
5644     pRExC_state->runtime_code_qr = NULL;
5645
5646     /****************** LONG JUMP TARGET HERE***********************/
5647     /* Longjmp back to here if have to switch in midstream to utf8 */
5648     if (! RExC_orig_utf8) {
5649         JMPENV_PUSH(jump_ret);
5650         used_setjump = TRUE;
5651     }
5652
5653     if (jump_ret == 0) {    /* First time through */
5654         xend = exp + plen;
5655
5656         DEBUG_COMPILE_r({
5657             SV *dsv= sv_newmortal();
5658             RE_PV_QUOTED_DECL(s, RExC_utf8,
5659                 dsv, exp, plen, 60);
5660             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5661                            PL_colors[4],PL_colors[5],s);
5662         });
5663     }
5664     else {  /* longjumped back */
5665         U8 *src, *dst;
5666         int n=0;
5667         STRLEN s = 0, d = 0;
5668         bool do_end = 0;
5669
5670         /* If the cause for the longjmp was other than changing to utf8, pop
5671          * our own setjmp, and longjmp to the correct handler */
5672         if (jump_ret != UTF8_LONGJMP) {
5673             JMPENV_POP;
5674             JMPENV_JUMP(jump_ret);
5675         }
5676
5677         GET_RE_DEBUG_FLAGS;
5678
5679         /* It's possible to write a regexp in ascii that represents Unicode
5680         codepoints outside of the byte range, such as via \x{100}. If we
5681         detect such a sequence we have to convert the entire pattern to utf8
5682         and then recompile, as our sizing calculation will have been based
5683         on 1 byte == 1 character, but we will need to use utf8 to encode
5684         at least some part of the pattern, and therefore must convert the whole
5685         thing.
5686         -- dmq */
5687         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5688             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5689
5690         /* upgrade pattern to UTF8, and if there are code blocks,
5691          * recalculate the indices.
5692          * This is essentially an unrolled Perl_bytes_to_utf8() */
5693
5694         src = (U8*)SvPV_nomg(pat, plen);
5695         Newx(dst, plen * 2 + 1, U8);
5696
5697         while (s < plen) {
5698             const UV uv = NATIVE_TO_ASCII(src[s]);
5699             if (UNI_IS_INVARIANT(uv))
5700                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5701             else {
5702                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5703                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5704             }
5705             if (n < pRExC_state->num_code_blocks) {
5706                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5707                     pRExC_state->code_blocks[n].start = d;
5708                     assert(dst[d] == '(');
5709                     do_end = 1;
5710                 }
5711                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5712                     pRExC_state->code_blocks[n].end = d;
5713                     assert(dst[d] == ')');
5714                     do_end = 0;
5715                     n++;
5716                 }
5717             }
5718             s++;
5719             d++;
5720         }
5721         dst[d] = '\0';
5722         plen = d;
5723         exp = (char*) dst;
5724         xend = exp + plen;
5725         SAVEFREEPV(exp);
5726         RExC_orig_utf8 = RExC_utf8 = 1;
5727     }
5728
5729     /* return old regex if pattern hasn't changed */
5730
5731     if (   old_re
5732         && !recompile
5733         && !!RX_UTF8(old_re) == !!RExC_utf8
5734         && RX_PRECOMP(old_re)
5735         && RX_PRELEN(old_re) == plen
5736         && memEQ(RX_PRECOMP(old_re), exp, plen))
5737     {
5738         /* with runtime code, always recompile */
5739         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5740                                             exp, plen);
5741         if (!runtime_code) {
5742             if (used_setjump) {
5743                 JMPENV_POP;
5744             }
5745             Safefree(pRExC_state->code_blocks);
5746             return old_re;
5747         }
5748     }
5749     else if ((pm_flags & PMf_USE_RE_EVAL)
5750                 /* this second condition covers the non-regex literal case,
5751                  * i.e.  $foo =~ '(?{})'. */
5752                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5753                     && (PL_hints & HINT_RE_EVAL))
5754     )
5755         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5756                             exp, plen);
5757
5758 #ifdef TRIE_STUDY_OPT
5759     restudied = 0;
5760 #endif
5761
5762     rx_flags = orig_rx_flags;
5763
5764     if (initial_charset == REGEX_LOCALE_CHARSET) {
5765         RExC_contains_locale = 1;
5766     }
5767     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5768
5769         /* Set to use unicode semantics if the pattern is in utf8 and has the
5770          * 'depends' charset specified, as it means unicode when utf8  */
5771         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5772     }
5773
5774     RExC_precomp = exp;
5775     RExC_flags = rx_flags;
5776     RExC_pm_flags = pm_flags;
5777
5778     if (runtime_code) {
5779         if (PL_tainting && PL_tainted)
5780             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5781
5782         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5783             /* whoops, we have a non-utf8 pattern, whilst run-time code
5784              * got compiled as utf8. Try again with a utf8 pattern */
5785              JMPENV_JUMP(UTF8_LONGJMP);
5786         }
5787     }
5788     assert(!pRExC_state->runtime_code_qr);
5789
5790     RExC_sawback = 0;
5791
5792     RExC_seen = 0;
5793     RExC_in_lookbehind = 0;
5794     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5795     RExC_extralen = 0;
5796     RExC_override_recoding = 0;
5797
5798     /* First pass: determine size, legality. */
5799     RExC_parse = exp;
5800     RExC_start = exp;
5801     RExC_end = xend;
5802     RExC_naughty = 0;
5803     RExC_npar = 1;
5804     RExC_nestroot = 0;
5805     RExC_size = 0L;
5806     RExC_emit = &PL_regdummy;
5807     RExC_whilem_seen = 0;
5808     RExC_open_parens = NULL;
5809     RExC_close_parens = NULL;
5810     RExC_opend = NULL;
5811     RExC_paren_names = NULL;
5812 #ifdef DEBUGGING
5813     RExC_paren_name_list = NULL;
5814 #endif
5815     RExC_recurse = NULL;
5816     RExC_recurse_count = 0;
5817     pRExC_state->code_index = 0;
5818
5819 #if 0 /* REGC() is (currently) a NOP at the first pass.
5820        * Clever compilers notice this and complain. --jhi */
5821     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5822 #endif
5823     DEBUG_PARSE_r(
5824         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5825         RExC_lastnum=0;
5826         RExC_lastparse=NULL;
5827     );
5828     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5829         RExC_precomp = NULL;
5830         Safefree(pRExC_state->code_blocks);
5831         return(NULL);
5832     }
5833
5834     /* Here, finished first pass.  Get rid of any added setjmp */
5835     if (used_setjump) {
5836         JMPENV_POP;
5837     }
5838
5839     DEBUG_PARSE_r({
5840         PerlIO_printf(Perl_debug_log,
5841             "Required size %"IVdf" nodes\n"
5842             "Starting second pass (creation)\n",
5843             (IV)RExC_size);
5844         RExC_lastnum=0;
5845         RExC_lastparse=NULL;
5846     });
5847
5848     /* The first pass could have found things that force Unicode semantics */
5849     if ((RExC_utf8 || RExC_uni_semantics)
5850          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5851     {
5852         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5853     }
5854
5855     /* Small enough for pointer-storage convention?
5856        If extralen==0, this means that we will not need long jumps. */
5857     if (RExC_size >= 0x10000L && RExC_extralen)
5858         RExC_size += RExC_extralen;
5859     else
5860         RExC_extralen = 0;
5861     if (RExC_whilem_seen > 15)
5862         RExC_whilem_seen = 15;
5863
5864     /* Allocate space and zero-initialize. Note, the two step process
5865        of zeroing when in debug mode, thus anything assigned has to
5866        happen after that */
5867     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5868     r = (struct regexp*)SvANY(rx);
5869     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5870          char, regexp_internal);
5871     if ( r == NULL || ri == NULL )
5872         FAIL("Regexp out of space");
5873 #ifdef DEBUGGING
5874     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5875     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5876 #else
5877     /* bulk initialize base fields with 0. */
5878     Zero(ri, sizeof(regexp_internal), char);
5879 #endif
5880
5881     /* non-zero initialization begins here */
5882     RXi_SET( r, ri );
5883     r->engine= eng;
5884     r->extflags = rx_flags;
5885     if (pm_flags & PMf_IS_QR) {
5886         ri->code_blocks = pRExC_state->code_blocks;
5887         ri->num_code_blocks = pRExC_state->num_code_blocks;
5888     }
5889     else
5890         SAVEFREEPV(pRExC_state->code_blocks);
5891
5892     {
5893         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5894         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5895
5896         /* The caret is output if there are any defaults: if not all the STD
5897          * flags are set, or if no character set specifier is needed */
5898         bool has_default =
5899                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5900                     || ! has_charset);
5901         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5902         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5903                             >> RXf_PMf_STD_PMMOD_SHIFT);
5904         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5905         char *p;
5906         /* Allocate for the worst case, which is all the std flags are turned
5907          * on.  If more precision is desired, we could do a population count of
5908          * the flags set.  This could be done with a small lookup table, or by
5909          * shifting, masking and adding, or even, when available, assembly
5910          * language for a machine-language population count.
5911          * We never output a minus, as all those are defaults, so are
5912          * covered by the caret */
5913         const STRLEN wraplen = plen + has_p + has_runon
5914             + has_default       /* If needs a caret */
5915
5916                 /* If needs a character set specifier */
5917             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5918             + (sizeof(STD_PAT_MODS) - 1)
5919             + (sizeof("(?:)") - 1);
5920
5921         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5922         SvPOK_on(rx);
5923         if (RExC_utf8)
5924             SvFLAGS(rx) |= SVf_UTF8;
5925         *p++='('; *p++='?';
5926
5927         /* If a default, cover it using the caret */
5928         if (has_default) {
5929             *p++= DEFAULT_PAT_MOD;
5930         }
5931         if (has_charset) {
5932             STRLEN len;
5933             const char* const name = get_regex_charset_name(r->extflags, &len);
5934             Copy(name, p, len, char);
5935             p += len;
5936         }
5937         if (has_p)
5938             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5939         {
5940             char ch;
5941             while((ch = *fptr++)) {
5942                 if(reganch & 1)
5943                     *p++ = ch;
5944                 reganch >>= 1;
5945             }
5946         }
5947
5948         *p++ = ':';
5949         Copy(RExC_precomp, p, plen, char);
5950         assert ((RX_WRAPPED(rx) - p) < 16);
5951         r->pre_prefix = p - RX_WRAPPED(rx);
5952         p += plen;
5953         if (has_runon)
5954             *p++ = '\n';
5955         *p++ = ')';
5956         *p = 0;
5957         SvCUR_set(rx, p - SvPVX_const(rx));
5958     }
5959
5960     r->intflags = 0;
5961     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5962
5963     if (RExC_seen & REG_SEEN_RECURSE) {
5964         Newxz(RExC_open_parens, RExC_npar,regnode *);
5965         SAVEFREEPV(RExC_open_parens);
5966         Newxz(RExC_close_parens,RExC_npar,regnode *);
5967         SAVEFREEPV(RExC_close_parens);
5968     }
5969
5970     /* Useful during FAIL. */
5971 #ifdef RE_TRACK_PATTERN_OFFSETS
5972     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5973     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5974                           "%s %"UVuf" bytes for offset annotations.\n",
5975                           ri->u.offsets ? "Got" : "Couldn't get",
5976                           (UV)((2*RExC_size+1) * sizeof(U32))));
5977 #endif
5978     SetProgLen(ri,RExC_size);
5979     RExC_rx_sv = rx;
5980     RExC_rx = r;
5981     RExC_rxi = ri;
5982
5983     /* Second pass: emit code. */
5984     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5985     RExC_pm_flags = pm_flags;
5986     RExC_parse = exp;
5987     RExC_end = xend;
5988     RExC_naughty = 0;
5989     RExC_npar = 1;
5990     RExC_emit_start = ri->program;
5991     RExC_emit = ri->program;
5992     RExC_emit_bound = ri->program + RExC_size + 1;
5993     pRExC_state->code_index = 0;
5994
5995     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5996     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5997         ReREFCNT_dec(rx);
5998         return(NULL);
5999     }
6000     /* XXXX To minimize changes to RE engine we always allocate
6001        3-units-long substrs field. */
6002     Newx(r->substrs, 1, struct reg_substr_data);
6003     if (RExC_recurse_count) {
6004         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6005         SAVEFREEPV(RExC_recurse);
6006     }
6007
6008 reStudy:
6009     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6010     Zero(r->substrs, 1, struct reg_substr_data);
6011
6012 #ifdef TRIE_STUDY_OPT
6013     if (!restudied) {
6014         StructCopy(&zero_scan_data, &data, scan_data_t);
6015         copyRExC_state = RExC_state;
6016     } else {
6017         U32 seen=RExC_seen;
6018         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6019
6020         RExC_state = copyRExC_state;
6021         if (seen & REG_TOP_LEVEL_BRANCHES)
6022             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6023         else
6024             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6025         if (data.last_found) {
6026             SvREFCNT_dec(data.longest_fixed);
6027             SvREFCNT_dec(data.longest_float);
6028             SvREFCNT_dec(data.last_found);
6029         }
6030         StructCopy(&zero_scan_data, &data, scan_data_t);
6031     }
6032 #else
6033     StructCopy(&zero_scan_data, &data, scan_data_t);
6034 #endif
6035
6036     /* Dig out information for optimizations. */
6037     r->extflags = RExC_flags; /* was pm_op */
6038     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6039
6040     if (UTF)
6041         SvUTF8_on(rx);  /* Unicode in it? */
6042     ri->regstclass = NULL;
6043     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6044         r->intflags |= PREGf_NAUGHTY;
6045     scan = ri->program + 1;             /* First BRANCH. */
6046
6047     /* testing for BRANCH here tells us whether there is "must appear"
6048        data in the pattern. If there is then we can use it for optimisations */
6049     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6050         I32 fake;
6051         STRLEN longest_float_length, longest_fixed_length;
6052         struct regnode_charclass_class ch_class; /* pointed to by data */
6053         int stclass_flag;
6054         I32 last_close = 0; /* pointed to by data */
6055         regnode *first= scan;
6056         regnode *first_next= regnext(first);
6057         /*
6058          * Skip introductions and multiplicators >= 1
6059          * so that we can extract the 'meat' of the pattern that must
6060          * match in the large if() sequence following.
6061          * NOTE that EXACT is NOT covered here, as it is normally
6062          * picked up by the optimiser separately.
6063          *
6064          * This is unfortunate as the optimiser isnt handling lookahead
6065          * properly currently.
6066          *
6067          */
6068         while ((OP(first) == OPEN && (sawopen = 1)) ||
6069                /* An OR of *one* alternative - should not happen now. */
6070             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6071             /* for now we can't handle lookbehind IFMATCH*/
6072             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6073             (OP(first) == PLUS) ||
6074             (OP(first) == MINMOD) ||
6075                /* An {n,m} with n>0 */
6076             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6077             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6078         {
6079                 /*
6080                  * the only op that could be a regnode is PLUS, all the rest
6081                  * will be regnode_1 or regnode_2.
6082                  *
6083                  */
6084                 if (OP(first) == PLUS)
6085                     sawplus = 1;
6086                 else
6087                     first += regarglen[OP(first)];
6088
6089                 first = NEXTOPER(first);
6090                 first_next= regnext(first);
6091         }
6092
6093         /* Starting-point info. */
6094       again:
6095         DEBUG_PEEP("first:",first,0);
6096         /* Ignore EXACT as we deal with it later. */
6097         if (PL_regkind[OP(first)] == EXACT) {
6098             if (OP(first) == EXACT)
6099                 NOOP;   /* Empty, get anchored substr later. */
6100             else
6101                 ri->regstclass = first;
6102         }
6103 #ifdef TRIE_STCLASS
6104         else if (PL_regkind[OP(first)] == TRIE &&
6105                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6106         {
6107             regnode *trie_op;
6108             /* this can happen only on restudy */
6109             if ( OP(first) == TRIE ) {
6110                 struct regnode_1 *trieop = (struct regnode_1 *)
6111                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6112                 StructCopy(first,trieop,struct regnode_1);
6113                 trie_op=(regnode *)trieop;
6114             } else {
6115                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6116                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6117                 StructCopy(first,trieop,struct regnode_charclass);
6118                 trie_op=(regnode *)trieop;
6119             }
6120             OP(trie_op)+=2;
6121             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6122             ri->regstclass = trie_op;
6123         }
6124 #endif
6125         else if (REGNODE_SIMPLE(OP(first)))
6126             ri->regstclass = first;
6127         else if (PL_regkind[OP(first)] == BOUND ||
6128                  PL_regkind[OP(first)] == NBOUND)
6129             ri->regstclass = first;
6130         else if (PL_regkind[OP(first)] == BOL) {
6131             r->extflags |= (OP(first) == MBOL
6132                            ? RXf_ANCH_MBOL
6133                            : (OP(first) == SBOL
6134                               ? RXf_ANCH_SBOL
6135                               : RXf_ANCH_BOL));
6136             first = NEXTOPER(first);
6137             goto again;
6138         }
6139         else if (OP(first) == GPOS) {
6140             r->extflags |= RXf_ANCH_GPOS;
6141             first = NEXTOPER(first);
6142             goto again;
6143         }
6144         else if ((!sawopen || !RExC_sawback) &&
6145             (OP(first) == STAR &&
6146             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6147             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6148         {
6149             /* turn .* into ^.* with an implied $*=1 */
6150             const int type =
6151                 (OP(NEXTOPER(first)) == REG_ANY)
6152                     ? RXf_ANCH_MBOL
6153                     : RXf_ANCH_SBOL;
6154             r->extflags |= type;
6155             r->intflags |= PREGf_IMPLICIT;
6156             first = NEXTOPER(first);
6157             goto again;
6158         }
6159         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6160             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6161             /* x+ must match at the 1st pos of run of x's */
6162             r->intflags |= PREGf_SKIP;
6163
6164         /* Scan is after the zeroth branch, first is atomic matcher. */
6165 #ifdef TRIE_STUDY_OPT
6166         DEBUG_PARSE_r(
6167             if (!restudied)
6168                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6169                               (IV)(first - scan + 1))
6170         );
6171 #else
6172         DEBUG_PARSE_r(
6173             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6174                 (IV)(first - scan + 1))
6175         );
6176 #endif
6177
6178
6179         /*
6180         * If there's something expensive in the r.e., find the
6181         * longest literal string that must appear and make it the
6182         * regmust.  Resolve ties in favor of later strings, since
6183         * the regstart check works with the beginning of the r.e.
6184         * and avoiding duplication strengthens checking.  Not a
6185         * strong reason, but sufficient in the absence of others.
6186         * [Now we resolve ties in favor of the earlier string if
6187         * it happens that c_offset_min has been invalidated, since the
6188         * earlier string may buy us something the later one won't.]
6189         */
6190
6191         data.longest_fixed = newSVpvs("");
6192         data.longest_float = newSVpvs("");
6193         data.last_found = newSVpvs("");
6194         data.longest = &(data.longest_fixed);
6195         first = scan;
6196         if (!ri->regstclass) {
6197             cl_init(pRExC_state, &ch_class);
6198             data.start_class = &ch_class;
6199             stclass_flag = SCF_DO_STCLASS_AND;
6200         } else                          /* XXXX Check for BOUND? */
6201             stclass_flag = 0;
6202         data.last_closep = &last_close;
6203
6204         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6205             &data, -1, NULL, NULL,
6206             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6207
6208
6209         CHECK_RESTUDY_GOTO;
6210
6211
6212         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6213              && data.last_start_min == 0 && data.last_end > 0
6214              && !RExC_seen_zerolen
6215              && !(RExC_seen & REG_SEEN_VERBARG)
6216              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6217             r->extflags |= RXf_CHECK_ALL;
6218         scan_commit(pRExC_state, &data,&minlen,0);
6219         SvREFCNT_dec(data.last_found);
6220
6221         longest_float_length = CHR_SVLEN(data.longest_float);
6222
6223         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6224                    && data.offset_fixed == data.offset_float_min
6225                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6226             && S_setup_longest (aTHX_ pRExC_state,
6227                                     data.longest_float,
6228                                     &(r->float_utf8),
6229                                     &(r->float_substr),
6230                                     &(r->float_end_shift),
6231                                     data.lookbehind_float,
6232                                     data.offset_float_min,
6233                                     data.minlen_float,
6234                                     longest_float_length,
6235                                     data.flags & SF_FL_BEFORE_EOL,
6236                                     data.flags & SF_FL_BEFORE_MEOL))
6237         {
6238             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6239             r->float_max_offset = data.offset_float_max;
6240             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6241                 r->float_max_offset -= data.lookbehind_float;
6242         }
6243         else {
6244             r->float_substr = r->float_utf8 = NULL;
6245             SvREFCNT_dec(data.longest_float);
6246             longest_float_length = 0;
6247         }
6248
6249         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6250
6251         if (S_setup_longest (aTHX_ pRExC_state,
6252                                 data.longest_fixed,
6253                                 &(r->anchored_utf8),
6254                                 &(r->anchored_substr),
6255                                 &(r->anchored_end_shift),
6256                                 data.lookbehind_fixed,
6257                                 data.offset_fixed,
6258                                 data.minlen_fixed,
6259                                 longest_fixed_length,
6260                                 data.flags & SF_FIX_BEFORE_EOL,
6261                                 data.flags & SF_FIX_BEFORE_MEOL))
6262         {
6263             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6264         }
6265         else {
6266             r->anchored_substr = r->anchored_utf8 = NULL;
6267             SvREFCNT_dec(data.longest_fixed);
6268             longest_fixed_length = 0;
6269         }
6270
6271         if (ri->regstclass
6272             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6273             ri->regstclass = NULL;
6274
6275         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6276             && stclass_flag
6277             && !(data.start_class->flags & ANYOF_EOS)
6278             && !cl_is_anything(data.start_class))
6279         {
6280             const U32 n = add_data(pRExC_state, 1, "f");
6281             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6282
6283             Newx(RExC_rxi->data->data[n], 1,
6284                 struct regnode_charclass_class);
6285             StructCopy(data.start_class,
6286                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6287                        struct regnode_charclass_class);
6288             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6289             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6290             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6291                       regprop(r, sv, (regnode*)data.start_class);
6292                       PerlIO_printf(Perl_debug_log,
6293                                     "synthetic stclass \"%s\".\n",
6294                                     SvPVX_const(sv));});
6295         }
6296
6297         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6298         if (longest_fixed_length > longest_float_length) {
6299             r->check_end_shift = r->anchored_end_shift;
6300             r->check_substr = r->anchored_substr;
6301             r->check_utf8 = r->anchored_utf8;
6302             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6303             if (r->extflags & RXf_ANCH_SINGLE)
6304                 r->extflags |= RXf_NOSCAN;
6305         }
6306         else {
6307             r->check_end_shift = r->float_end_shift;
6308             r->check_substr = r->float_substr;
6309             r->check_utf8 = r->float_utf8;
6310             r->check_offset_min = r->float_min_offset;
6311             r->check_offset_max = r->float_max_offset;
6312         }
6313         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6314            This should be changed ASAP!  */
6315         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6316             r->extflags |= RXf_USE_INTUIT;
6317             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6318                 r->extflags |= RXf_INTUIT_TAIL;
6319         }
6320         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6321         if ( (STRLEN)minlen < longest_float_length )
6322             minlen= longest_float_length;
6323         if ( (STRLEN)minlen < longest_fixed_length )
6324             minlen= longest_fixed_length;
6325         */
6326     }
6327     else {
6328         /* Several toplevels. Best we can is to set minlen. */
6329         I32 fake;
6330         struct regnode_charclass_class ch_class;
6331         I32 last_close = 0;
6332
6333         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6334
6335         scan = ri->program + 1;
6336         cl_init(pRExC_state, &ch_class);
6337         data.start_class = &ch_class;
6338         data.last_closep = &last_close;
6339
6340
6341         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6342             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6343
6344         CHECK_RESTUDY_GOTO;
6345
6346         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6347                 = r->float_substr = r->float_utf8 = NULL;
6348
6349         if (!(data.start_class->flags & ANYOF_EOS)
6350             && !cl_is_anything(data.start_class))
6351         {
6352             const U32 n = add_data(pRExC_state, 1, "f");
6353             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6354
6355             Newx(RExC_rxi->data->data[n], 1,
6356                 struct regnode_charclass_class);
6357             StructCopy(data.start_class,
6358                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6359                        struct regnode_charclass_class);
6360             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6361             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6362             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6363                       regprop(r, sv, (regnode*)data.start_class);
6364                       PerlIO_printf(Perl_debug_log,
6365                                     "synthetic stclass \"%s\".\n",
6366                                     SvPVX_const(sv));});
6367         }
6368     }
6369
6370     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6371        the "real" pattern. */
6372     DEBUG_OPTIMISE_r({
6373         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6374                       (IV)minlen, (IV)r->minlen);
6375     });
6376     r->minlenret = minlen;
6377     if (r->minlen < minlen)
6378         r->minlen = minlen;
6379
6380     if (RExC_seen & REG_SEEN_GPOS)
6381         r->extflags |= RXf_GPOS_SEEN;
6382     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6383         r->extflags |= RXf_LOOKBEHIND_SEEN;
6384     if (pRExC_state->num_code_blocks)
6385         r->extflags |= RXf_EVAL_SEEN;
6386     if (RExC_seen & REG_SEEN_CANY)
6387         r->extflags |= RXf_CANY_SEEN;
6388     if (RExC_seen & REG_SEEN_VERBARG)
6389         r->intflags |= PREGf_VERBARG_SEEN;
6390     if (RExC_seen & REG_SEEN_CUTGROUP)
6391         r->intflags |= PREGf_CUTGROUP_SEEN;
6392     if (pm_flags & PMf_USE_RE_EVAL)
6393         r->intflags |= PREGf_USE_RE_EVAL;
6394     if (RExC_paren_names)
6395         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6396     else
6397         RXp_PAREN_NAMES(r) = NULL;
6398
6399 #ifdef STUPID_PATTERN_CHECKS
6400     if (RX_PRELEN(rx) == 0)
6401         r->extflags |= RXf_NULL;
6402     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6403         /* XXX: this should happen BEFORE we compile */
6404         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6405     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6406         r->extflags |= RXf_WHITE;
6407     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6408         r->extflags |= RXf_START_ONLY;
6409 #else
6410     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6411             /* XXX: this should happen BEFORE we compile */
6412             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6413     else {
6414         regnode *first = ri->program + 1;
6415         U8 fop = OP(first);
6416
6417         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6418             r->extflags |= RXf_NULL;
6419         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6420             r->extflags |= RXf_START_ONLY;
6421         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6422                              && OP(regnext(first)) == END)
6423             r->extflags |= RXf_WHITE;
6424     }
6425 #endif
6426 #ifdef DEBUGGING
6427     if (RExC_paren_names) {
6428         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6429         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6430     } else
6431 #endif
6432         ri->name_list_idx = 0;
6433
6434     if (RExC_recurse_count) {
6435         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6436             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6437             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6438         }
6439     }
6440     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6441     /* assume we don't need to swap parens around before we match */
6442
6443     DEBUG_DUMP_r({
6444         PerlIO_printf(Perl_debug_log,"Final program:\n");
6445         regdump(r);
6446     });
6447 #ifdef RE_TRACK_PATTERN_OFFSETS
6448     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6449         const U32 len = ri->u.offsets[0];
6450         U32 i;
6451         GET_RE_DEBUG_FLAGS_DECL;
6452         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6453         for (i = 1; i <= len; i++) {
6454             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6455                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6456                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6457             }
6458         PerlIO_printf(Perl_debug_log, "\n");
6459     });
6460 #endif
6461     return rx;
6462 }
6463
6464
6465 SV*
6466 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6467                     const U32 flags)
6468 {
6469     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6470
6471     PERL_UNUSED_ARG(value);
6472
6473     if (flags & RXapif_FETCH) {
6474         return reg_named_buff_fetch(rx, key, flags);
6475     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6476         Perl_croak_no_modify(aTHX);
6477         return NULL;
6478     } else if (flags & RXapif_EXISTS) {
6479         return reg_named_buff_exists(rx, key, flags)
6480             ? &PL_sv_yes
6481             : &PL_sv_no;
6482     } else if (flags & RXapif_REGNAMES) {
6483         return reg_named_buff_all(rx, flags);
6484     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6485         return reg_named_buff_scalar(rx, flags);
6486     } else {
6487         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6488         return NULL;
6489     }
6490 }
6491
6492 SV*
6493 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6494                          const U32 flags)
6495 {
6496     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6497     PERL_UNUSED_ARG(lastkey);
6498
6499     if (flags & RXapif_FIRSTKEY)
6500         return reg_named_buff_firstkey(rx, flags);
6501     else if (flags & RXapif_NEXTKEY)
6502         return reg_named_buff_nextkey(rx, flags);
6503     else {
6504         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6505         return NULL;
6506     }
6507 }
6508
6509 SV*
6510 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6511                           const U32 flags)
6512 {
6513     AV *retarray = NULL;
6514     SV *ret;
6515     struct regexp *const rx = (struct regexp *)SvANY(r);
6516
6517     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6518
6519     if (flags & RXapif_ALL)
6520         retarray=newAV();
6521
6522     if (rx && RXp_PAREN_NAMES(rx)) {
6523         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6524         if (he_str) {
6525             IV i;
6526             SV* sv_dat=HeVAL(he_str);
6527             I32 *nums=(I32*)SvPVX(sv_dat);
6528             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6529                 if ((I32)(rx->nparens) >= nums[i]
6530                     && rx->offs[nums[i]].start != -1
6531                     && rx->offs[nums[i]].end != -1)
6532                 {
6533                     ret = newSVpvs("");
6534                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6535                     if (!retarray)
6536                         return ret;
6537                 } else {
6538                     if (retarray)
6539                         ret = newSVsv(&PL_sv_undef);
6540                 }
6541                 if (retarray)
6542                     av_push(retarray, ret);
6543             }
6544             if (retarray)
6545                 return newRV_noinc(MUTABLE_SV(retarray));
6546         }
6547     }
6548     return NULL;
6549 }
6550
6551 bool
6552 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6553                            const U32 flags)
6554 {
6555     struct regexp *const rx = (struct regexp *)SvANY(r);
6556
6557     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6558
6559     if (rx && RXp_PAREN_NAMES(rx)) {
6560         if (flags & RXapif_ALL) {
6561             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6562         } else {
6563             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6564             if (sv) {
6565                 SvREFCNT_dec(sv);
6566                 return TRUE;
6567             } else {
6568                 return FALSE;
6569             }
6570         }
6571     } else {
6572         return FALSE;
6573     }
6574 }
6575
6576 SV*
6577 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6578 {
6579     struct regexp *const rx = (struct regexp *)SvANY(r);
6580
6581     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6582
6583     if ( rx && RXp_PAREN_NAMES(rx) ) {
6584         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6585
6586         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6587     } else {
6588         return FALSE;
6589     }
6590 }
6591
6592 SV*
6593 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6594 {
6595     struct regexp *const rx = (struct regexp *)SvANY(r);
6596     GET_RE_DEBUG_FLAGS_DECL;
6597
6598     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6599
6600     if (rx && RXp_PAREN_NAMES(rx)) {
6601         HV *hv = RXp_PAREN_NAMES(rx);
6602         HE *temphe;
6603         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6604             IV i;
6605             IV parno = 0;
6606             SV* sv_dat = HeVAL(temphe);
6607             I32 *nums = (I32*)SvPVX(sv_dat);
6608             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6609                 if ((I32)(rx->lastparen) >= nums[i] &&
6610                     rx->offs[nums[i]].start != -1 &&
6611                     rx->offs[nums[i]].end != -1)
6612                 {
6613                     parno = nums[i];
6614                     break;
6615                 }
6616             }
6617             if (parno || flags & RXapif_ALL) {
6618                 return newSVhek(HeKEY_hek(temphe));
6619             }
6620         }
6621     }
6622     return NULL;
6623 }
6624
6625 SV*
6626 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6627 {
6628     SV *ret;
6629     AV *av;
6630     I32 length;
6631     struct regexp *const rx = (struct regexp *)SvANY(r);
6632
6633     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6634
6635     if (rx && RXp_PAREN_NAMES(rx)) {
6636         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6637             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6638         } else if (flags & RXapif_ONE) {
6639             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6640             av = MUTABLE_AV(SvRV(ret));
6641             length = av_len(av);
6642             SvREFCNT_dec(ret);
6643             return newSViv(length + 1);
6644         } else {
6645             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6646             return NULL;
6647         }
6648     }
6649     return &PL_sv_undef;
6650 }
6651
6652 SV*
6653 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6654 {
6655     struct regexp *const rx = (struct regexp *)SvANY(r);
6656     AV *av = newAV();
6657
6658     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6659
6660     if (rx && RXp_PAREN_NAMES(rx)) {
6661         HV *hv= RXp_PAREN_NAMES(rx);
6662         HE *temphe;
6663         (void)hv_iterinit(hv);
6664         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6665             IV i;
6666             IV parno = 0;
6667             SV* sv_dat = HeVAL(temphe);
6668             I32 *nums = (I32*)SvPVX(sv_dat);
6669             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6670                 if ((I32)(rx->lastparen) >= nums[i] &&
6671                     rx->offs[nums[i]].start != -1 &&
6672                     rx->offs[nums[i]].end != -1)
6673                 {
6674                     parno = nums[i];
6675                     break;
6676                 }
6677             }
6678             if (parno || flags & RXapif_ALL) {
6679                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6680             }
6681         }
6682     }
6683
6684     return newRV_noinc(MUTABLE_SV(av));
6685 }
6686
6687 void
6688 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6689                              SV * const sv)
6690 {
6691     struct regexp *const rx = (struct regexp *)SvANY(r);
6692     char *s = NULL;
6693     I32 i = 0;
6694     I32 s1, t1;
6695
6696     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6697
6698     if (!rx->subbeg) {
6699         sv_setsv(sv,&PL_sv_undef);
6700         return;
6701     }
6702     else
6703     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6704         /* $` */
6705         i = rx->offs[0].start;
6706         s = rx->subbeg;
6707     }
6708     else
6709     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6710         /* $' */
6711         s = rx->subbeg + rx->offs[0].end;
6712         i = rx->sublen - rx->offs[0].end;
6713     }
6714     else
6715     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6716         (s1 = rx->offs[paren].start) != -1 &&
6717         (t1 = rx->offs[paren].end) != -1)
6718     {
6719         /* $& $1 ... */
6720         i = t1 - s1;
6721         s = rx->subbeg + s1;
6722     } else {
6723         sv_setsv(sv,&PL_sv_undef);
6724         return;
6725     }
6726     assert(rx->sublen >= (s - rx->subbeg) + i );
6727     if (i >= 0) {
6728         const int oldtainted = PL_tainted;
6729         TAINT_NOT;
6730         sv_setpvn(sv, s, i);
6731         PL_tainted = oldtainted;
6732         if ( (rx->extflags & RXf_CANY_SEEN)
6733             ? (RXp_MATCH_UTF8(rx)
6734                         && (!i || is_utf8_string((U8*)s, i)))
6735             : (RXp_MATCH_UTF8(rx)) )
6736         {
6737             SvUTF8_on(sv);
6738         }
6739         else
6740             SvUTF8_off(sv);
6741         if (PL_tainting) {
6742             if (RXp_MATCH_TAINTED(rx)) {
6743                 if (SvTYPE(sv) >= SVt_PVMG) {
6744                     MAGIC* const mg = SvMAGIC(sv);
6745                     MAGIC* mgt;
6746                     PL_tainted = 1;
6747                     SvMAGIC_set(sv, mg->mg_moremagic);
6748                     SvTAINT(sv);
6749                     if ((mgt = SvMAGIC(sv))) {
6750                         mg->mg_moremagic = mgt;
6751                         SvMAGIC_set(sv, mg);
6752                     }
6753                 } else {
6754                     PL_tainted = 1;
6755                     SvTAINT(sv);
6756                 }
6757             } else
6758                 SvTAINTED_off(sv);
6759         }
6760     } else {
6761         sv_setsv(sv,&PL_sv_undef);
6762         return;
6763     }
6764 }
6765
6766 void
6767 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6768                                                          SV const * const value)
6769 {
6770     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6771
6772     PERL_UNUSED_ARG(rx);
6773     PERL_UNUSED_ARG(paren);
6774     PERL_UNUSED_ARG(value);
6775
6776     if (!PL_localizing)
6777         Perl_croak_no_modify(aTHX);
6778 }
6779
6780 I32
6781 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6782                               const I32 paren)
6783 {
6784     struct regexp *const rx = (struct regexp *)SvANY(r);
6785     I32 i;
6786     I32 s1, t1;
6787
6788     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6789
6790     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6791         switch (paren) {
6792       /* $` / ${^PREMATCH} */
6793       case RX_BUFF_IDX_PREMATCH:
6794         if (rx->offs[0].start != -1) {
6795                         i = rx->offs[0].start;
6796                         if (i > 0) {
6797                                 s1 = 0;
6798                                 t1 = i;
6799                                 goto getlen;
6800                         }
6801             }
6802         return 0;
6803       /* $' / ${^POSTMATCH} */
6804       case RX_BUFF_IDX_POSTMATCH:
6805             if (rx->offs[0].end != -1) {
6806                         i = rx->sublen - rx->offs[0].end;
6807                         if (i > 0) {
6808                                 s1 = rx->offs[0].end;
6809                                 t1 = rx->sublen;
6810                                 goto getlen;
6811                         }
6812             }
6813         return 0;
6814       /* $& / ${^MATCH}, $1, $2, ... */
6815       default:
6816             if (paren <= (I32)rx->nparens &&
6817             (s1 = rx->offs[paren].start) != -1 &&
6818             (t1 = rx->offs[paren].end) != -1)
6819             {
6820             i = t1 - s1;
6821             goto getlen;
6822         } else {
6823             if (ckWARN(WARN_UNINITIALIZED))
6824                 report_uninit((const SV *)sv);
6825             return 0;
6826         }
6827     }
6828   getlen:
6829     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6830         const char * const s = rx->subbeg + s1;
6831         const U8 *ep;
6832         STRLEN el;
6833
6834         i = t1 - s1;
6835         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6836                         i = el;
6837     }
6838     return i;
6839 }
6840
6841 SV*
6842 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6843 {
6844     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6845         PERL_UNUSED_ARG(rx);
6846         if (0)
6847             return NULL;
6848         else
6849             return newSVpvs("Regexp");
6850 }
6851
6852 /* Scans the name of a named buffer from the pattern.
6853  * If flags is REG_RSN_RETURN_NULL returns null.
6854  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6855  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6856  * to the parsed name as looked up in the RExC_paren_names hash.
6857  * If there is an error throws a vFAIL().. type exception.
6858  */
6859
6860 #define REG_RSN_RETURN_NULL    0
6861 #define REG_RSN_RETURN_NAME    1
6862 #define REG_RSN_RETURN_DATA    2
6863
6864 STATIC SV*
6865 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6866 {
6867     char *name_start = RExC_parse;
6868
6869     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6870
6871     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6872          /* skip IDFIRST by using do...while */
6873         if (UTF)
6874             do {
6875                 RExC_parse += UTF8SKIP(RExC_parse);
6876             } while (isALNUM_utf8((U8*)RExC_parse));
6877         else
6878             do {
6879                 RExC_parse++;
6880             } while (isALNUM(*RExC_parse));
6881     } else {
6882         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6883         vFAIL("Group name must start with a non-digit word character");
6884     }
6885     if ( flags ) {
6886         SV* sv_name
6887             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6888                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6889         if ( flags == REG_RSN_RETURN_NAME)
6890             return sv_name;
6891         else if (flags==REG_RSN_RETURN_DATA) {
6892             HE *he_str = NULL;
6893             SV *sv_dat = NULL;
6894             if ( ! sv_name )      /* should not happen*/
6895                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6896             if (RExC_paren_names)
6897                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6898             if ( he_str )
6899                 sv_dat = HeVAL(he_str);
6900             if ( ! sv_dat )
6901                 vFAIL("Reference to nonexistent named group");
6902             return sv_dat;
6903         }
6904         else {
6905             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6906                        (unsigned long) flags);
6907         }
6908         assert(0); /* NOT REACHED */
6909     }
6910     return NULL;
6911 }
6912
6913 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6914     int rem=(int)(RExC_end - RExC_parse);                       \
6915     int cut;                                                    \
6916     int num;                                                    \
6917     int iscut=0;                                                \
6918     if (rem>10) {                                               \
6919         rem=10;                                                 \
6920         iscut=1;                                                \
6921     }                                                           \
6922     cut=10-rem;                                                 \
6923     if (RExC_lastparse!=RExC_parse)                             \
6924         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6925             rem, RExC_parse,                                    \
6926             cut + 4,                                            \
6927             iscut ? "..." : "<"                                 \
6928         );                                                      \
6929     else                                                        \
6930         PerlIO_printf(Perl_debug_log,"%16s","");                \
6931                                                                 \
6932     if (SIZE_ONLY)                                              \
6933        num = RExC_size + 1;                                     \
6934     else                                                        \
6935        num=REG_NODE_NUM(RExC_emit);                             \
6936     if (RExC_lastnum!=num)                                      \
6937        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6938     else                                                        \
6939        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6940     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6941         (int)((depth*2)), "",                                   \
6942         (funcname)                                              \
6943     );                                                          \
6944     RExC_lastnum=num;                                           \
6945     RExC_lastparse=RExC_parse;                                  \
6946 })
6947
6948
6949
6950 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6951     DEBUG_PARSE_MSG((funcname));                            \
6952     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6953 })
6954 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6955     DEBUG_PARSE_MSG((funcname));                            \
6956     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6957 })
6958
6959 /* This section of code defines the inversion list object and its methods.  The
6960  * interfaces are highly subject to change, so as much as possible is static to
6961  * this file.  An inversion list is here implemented as a malloc'd C UV array
6962  * with some added info that is placed as UVs at the beginning in a header
6963  * portion.  An inversion list for Unicode is an array of code points, sorted
6964  * by ordinal number.  The zeroth element is the first code point in the list.
6965  * The 1th element is the first element beyond that not in the list.  In other
6966  * words, the first range is
6967  *  invlist[0]..(invlist[1]-1)
6968  * The other ranges follow.  Thus every element whose index is divisible by two
6969  * marks the beginning of a range that is in the list, and every element not
6970  * divisible by two marks the beginning of a range not in the list.  A single
6971  * element inversion list that contains the single code point N generally
6972  * consists of two elements
6973  *  invlist[0] == N
6974  *  invlist[1] == N+1
6975  * (The exception is when N is the highest representable value on the
6976  * machine, in which case the list containing just it would be a single
6977  * element, itself.  By extension, if the last range in the list extends to
6978  * infinity, then the first element of that range will be in the inversion list
6979  * at a position that is divisible by two, and is the final element in the
6980  * list.)
6981  * Taking the complement (inverting) an inversion list is quite simple, if the
6982  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6983  * This implementation reserves an element at the beginning of each inversion list
6984  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6985  * beginning of the list is either that element if 0, or the next one if 1.
6986  *
6987  * More about inversion lists can be found in "Unicode Demystified"
6988  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6989  * More will be coming when functionality is added later.
6990  *
6991  * The inversion list data structure is currently implemented as an SV pointing
6992  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6993  * array of UV whose memory management is automatically handled by the existing
6994  * facilities for SV's.
6995  *
6996  * Some of the methods should always be private to the implementation, and some
6997  * should eventually be made public */
6998
6999 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
7000 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
7001
7002 /* This is a combination of a version and data structure type, so that one
7003  * being passed in can be validated to be an inversion list of the correct
7004  * vintage.  When the structure of the header is changed, a new random number
7005  * in the range 2**31-1 should be generated and the new() method changed to
7006  * insert that at this location.  Then, if an auxiliary program doesn't change
7007  * correspondingly, it will be discovered immediately */
7008 #define INVLIST_VERSION_ID_OFFSET 2
7009 #define INVLIST_VERSION_ID 1064334010
7010
7011 /* For safety, when adding new elements, remember to #undef them at the end of
7012  * the inversion list code section */
7013
7014 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
7015 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
7016  * contains the code point U+00000, and begins here.  If 1, the inversion list
7017  * doesn't contain U+0000, and it begins at the next UV in the array.
7018  * Inverting an inversion list consists of adding or removing the 0 at the
7019  * beginning of it.  By reserving a space for that 0, inversion can be made
7020  * very fast */
7021
7022 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7023
7024 /* Internally things are UVs */
7025 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7026 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7027
7028 #define INVLIST_INITIAL_LEN 10
7029
7030 PERL_STATIC_INLINE UV*
7031 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7032 {
7033     /* Returns a pointer to the first element in the inversion list's array.
7034      * This is called upon initialization of an inversion list.  Where the
7035      * array begins depends on whether the list has the code point U+0000
7036      * in it or not.  The other parameter tells it whether the code that
7037      * follows this call is about to put a 0 in the inversion list or not.
7038      * The first element is either the element with 0, if 0, or the next one,
7039      * if 1 */
7040
7041     UV* zero = get_invlist_zero_addr(invlist);
7042
7043     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7044
7045     /* Must be empty */
7046     assert(! *get_invlist_len_addr(invlist));
7047
7048     /* 1^1 = 0; 1^0 = 1 */
7049     *zero = 1 ^ will_have_0;
7050     return zero + *zero;
7051 }
7052
7053 PERL_STATIC_INLINE UV*
7054 S_invlist_array(pTHX_ SV* const invlist)
7055 {
7056     /* Returns the pointer to the inversion list's array.  Every time the
7057      * length changes, this needs to be called in case malloc or realloc moved
7058      * it */
7059
7060     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7061
7062     /* Must not be empty.  If these fail, you probably didn't check for <len>
7063      * being non-zero before trying to get the array */
7064     assert(*get_invlist_len_addr(invlist));
7065     assert(*get_invlist_zero_addr(invlist) == 0
7066            || *get_invlist_zero_addr(invlist) == 1);
7067
7068     /* The array begins either at the element reserved for zero if the
7069      * list contains 0 (that element will be set to 0), or otherwise the next
7070      * element (in which case the reserved element will be set to 1). */
7071     return (UV *) (get_invlist_zero_addr(invlist)
7072                    + *get_invlist_zero_addr(invlist));
7073 }
7074
7075 PERL_STATIC_INLINE UV*
7076 S_get_invlist_len_addr(pTHX_ SV* invlist)
7077 {
7078     /* Return the address of the UV that contains the current number
7079      * of used elements in the inversion list */
7080
7081     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7082
7083     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7084 }
7085
7086 PERL_STATIC_INLINE UV
7087 S_invlist_len(pTHX_ SV* const invlist)
7088 {
7089     /* Returns the current number of elements stored in the inversion list's
7090      * array */
7091
7092     PERL_ARGS_ASSERT_INVLIST_LEN;
7093
7094     return *get_invlist_len_addr(invlist);
7095 }
7096
7097 PERL_STATIC_INLINE void
7098 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7099 {
7100     /* Sets the current number of elements stored in the inversion list */
7101
7102     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7103
7104     *get_invlist_len_addr(invlist) = len;
7105
7106     assert(len <= SvLEN(invlist));
7107
7108     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7109     /* If the list contains U+0000, that element is part of the header,
7110      * and should not be counted as part of the array.  It will contain
7111      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7112      * subtract:
7113      *  SvCUR_set(invlist,
7114      *            TO_INTERNAL_SIZE(len
7115      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7116      * But, this is only valid if len is not 0.  The consequences of not doing
7117      * this is that the memory allocation code may think that 1 more UV is
7118      * being used than actually is, and so might do an unnecessary grow.  That
7119      * seems worth not bothering to make this the precise amount.
7120      *
7121      * Note that when inverting, SvCUR shouldn't change */
7122 }
7123
7124 PERL_STATIC_INLINE UV
7125 S_invlist_max(pTHX_ SV* const invlist)
7126 {
7127     /* Returns the maximum number of elements storable in the inversion list's
7128      * array, without having to realloc() */
7129
7130     PERL_ARGS_ASSERT_INVLIST_MAX;
7131
7132     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7133 }
7134
7135 PERL_STATIC_INLINE UV*
7136 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7137 {
7138     /* Return the address of the UV that is reserved to hold 0 if the inversion
7139      * list contains 0.  This has to be the last element of the heading, as the
7140      * list proper starts with either it if 0, or the next element if not.
7141      * (But we force it to contain either 0 or 1) */
7142
7143     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7144
7145     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7146 }
7147
7148 #ifndef PERL_IN_XSUB_RE
7149 SV*
7150 Perl__new_invlist(pTHX_ IV initial_size)
7151 {
7152
7153     /* Return a pointer to a newly constructed inversion list, with enough
7154      * space to store 'initial_size' elements.  If that number is negative, a
7155      * system default is used instead */
7156
7157     SV* new_list;
7158
7159     if (initial_size < 0) {
7160         initial_size = INVLIST_INITIAL_LEN;
7161     }
7162
7163     /* Allocate the initial space */
7164     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7165     invlist_set_len(new_list, 0);
7166
7167     /* Force iterinit() to be used to get iteration to work */
7168     *get_invlist_iter_addr(new_list) = UV_MAX;
7169
7170     /* This should force a segfault if a method doesn't initialize this
7171      * properly */
7172     *get_invlist_zero_addr(new_list) = UV_MAX;
7173
7174     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7175 #if HEADER_LENGTH != 4
7176 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7177 #endif
7178
7179     return new_list;
7180 }
7181 #endif
7182
7183 STATIC SV*
7184 S__new_invlist_C_array(pTHX_ UV* list)
7185 {
7186     /* Return a pointer to a newly constructed inversion list, initialized to
7187      * point to <list>, which has to be in the exact correct inversion list
7188      * form, including internal fields.  Thus this is a dangerous routine that
7189      * should not be used in the wrong hands */
7190
7191     SV* invlist = newSV_type(SVt_PV);
7192
7193     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7194
7195     SvPV_set(invlist, (char *) list);
7196     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7197                                shouldn't touch it */
7198     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7199
7200     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7201         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7202     }
7203
7204     return invlist;
7205 }
7206
7207 STATIC void
7208 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7209 {
7210     /* Grow the maximum size of an inversion list */
7211
7212     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7213
7214     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7215 }
7216
7217 PERL_STATIC_INLINE void
7218 S_invlist_trim(pTHX_ SV* const invlist)
7219 {
7220     PERL_ARGS_ASSERT_INVLIST_TRIM;
7221
7222     /* Change the length of the inversion list to how many entries it currently
7223      * has */
7224
7225     SvPV_shrink_to_cur((SV *) invlist);
7226 }
7227
7228 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7229  * etc */
7230 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7231 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7232
7233 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7234
7235 STATIC void
7236 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7237 {
7238    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7239     * the end of the inversion list.  The range must be above any existing
7240     * ones. */
7241
7242     UV* array;
7243     UV max = invlist_max(invlist);
7244     UV len = invlist_len(invlist);
7245
7246     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7247
7248     if (len == 0) { /* Empty lists must be initialized */
7249         array = _invlist_array_init(invlist, start == 0);
7250     }
7251     else {
7252         /* Here, the existing list is non-empty. The current max entry in the
7253          * list is generally the first value not in the set, except when the
7254          * set extends to the end of permissible values, in which case it is
7255          * the first entry in that final set, and so this call is an attempt to
7256          * append out-of-order */
7257
7258         UV final_element = len - 1;
7259         array = invlist_array(invlist);
7260         if (array[final_element] > start
7261             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7262         {
7263             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7264                        array[final_element], start,
7265                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7266         }
7267
7268         /* Here, it is a legal append.  If the new range begins with the first
7269          * value not in the set, it is extending the set, so the new first
7270          * value not in the set is one greater than the newly extended range.
7271          * */
7272         if (array[final_element] == start) {
7273             if (end != UV_MAX) {
7274                 array[final_element] = end + 1;
7275             }
7276             else {
7277                 /* But if the end is the maximum representable on the machine,
7278                  * just let the range that this would extend to have no end */
7279                 invlist_set_len(invlist, len - 1);
7280             }
7281             return;
7282         }
7283     }
7284
7285     /* Here the new range doesn't extend any existing set.  Add it */
7286
7287     len += 2;   /* Includes an element each for the start and end of range */
7288
7289     /* If overflows the existing space, extend, which may cause the array to be
7290      * moved */
7291     if (max < len) {
7292         invlist_extend(invlist, len);
7293         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7294                                            failure in invlist_array() */
7295         array = invlist_array(invlist);
7296     }
7297     else {
7298         invlist_set_len(invlist, len);
7299     }
7300
7301     /* The next item on the list starts the range, the one after that is
7302      * one past the new range.  */
7303     array[len - 2] = start;
7304     if (end != UV_MAX) {
7305         array[len - 1] = end + 1;
7306     }
7307     else {
7308         /* But if the end is the maximum representable on the machine, just let
7309          * the range have no end */
7310         invlist_set_len(invlist, len - 1);
7311     }
7312 }
7313
7314 #ifndef PERL_IN_XSUB_RE
7315
7316 IV
7317 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7318 {
7319     /* Searches the inversion list for the entry that contains the input code
7320      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7321      * return value is the index into the list's array of the range that
7322      * contains <cp> */
7323
7324     IV low = 0;
7325     IV high = invlist_len(invlist);
7326     const UV * const array = invlist_array(invlist);
7327
7328     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7329
7330     /* If list is empty or the code point is before the first element, return
7331      * failure. */
7332     if (high == 0 || cp < array[0]) {
7333         return -1;
7334     }
7335
7336     /* Binary search.  What we are looking for is <i> such that
7337      *  array[i] <= cp < array[i+1]
7338      * The loop below converges on the i+1. */
7339     while (low < high) {
7340         IV mid = (low + high) / 2;
7341         if (array[mid] <= cp) {
7342             low = mid + 1;
7343
7344             /* We could do this extra test to exit the loop early.
7345             if (cp < array[low]) {
7346                 return mid;
7347             }
7348             */
7349         }
7350         else { /* cp < array[mid] */
7351             high = mid;
7352         }
7353     }
7354
7355     return high - 1;
7356 }
7357
7358 void
7359 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7360 {
7361     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7362      * but is used when the swash has an inversion list.  This makes this much
7363      * faster, as it uses a binary search instead of a linear one.  This is
7364      * intimately tied to that function, and perhaps should be in utf8.c,
7365      * except it is intimately tied to inversion lists as well.  It assumes
7366      * that <swatch> is all 0's on input */
7367
7368     UV current = start;
7369     const IV len = invlist_len(invlist);
7370     IV i;
7371     const UV * array;
7372
7373     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7374
7375     if (len == 0) { /* Empty inversion list */
7376         return;
7377     }
7378
7379     array = invlist_array(invlist);
7380
7381     /* Find which element it is */
7382     i = _invlist_search(invlist, start);
7383
7384     /* We populate from <start> to <end> */
7385     while (current < end) {
7386         UV upper;
7387
7388         /* The inversion list gives the results for every possible code point
7389          * after the first one in the list.  Only those ranges whose index is
7390          * even are ones that the inversion list matches.  For the odd ones,
7391          * and if the initial code point is not in the list, we have to skip
7392          * forward to the next element */
7393         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7394             i++;
7395             if (i >= len) { /* Finished if beyond the end of the array */
7396                 return;
7397             }
7398             current = array[i];
7399             if (current >= end) {   /* Finished if beyond the end of what we
7400                                        are populating */
7401                 return;
7402             }
7403         }
7404         assert(current >= start);
7405
7406         /* The current range ends one below the next one, except don't go past
7407          * <end> */
7408         i++;
7409         upper = (i < len && array[i] < end) ? array[i] : end;
7410
7411         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7412          * for each code point in it */
7413         for (; current < upper; current++) {
7414             const STRLEN offset = (STRLEN)(current - start);
7415             swatch[offset >> 3] |= 1 << (offset & 7);
7416         }
7417
7418         /* Quit if at the end of the list */
7419         if (i >= len) {
7420
7421             /* But first, have to deal with the highest possible code point on
7422              * the platform.  The previous code assumes that <end> is one
7423              * beyond where we want to populate, but that is impossible at the
7424              * platform's infinity, so have to handle it specially */
7425             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7426             {
7427                 const STRLEN offset = (STRLEN)(end - start);
7428                 swatch[offset >> 3] |= 1 << (offset & 7);
7429             }
7430             return;
7431         }
7432
7433         /* Advance to the next range, which will be for code points not in the
7434          * inversion list */
7435         current = array[i];
7436     }
7437
7438     return;
7439 }
7440
7441 void
7442 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7443 {
7444     /* Take the union of two inversion lists and point <output> to it.  *output
7445      * should be defined upon input, and if it points to one of the two lists,
7446      * the reference count to that list will be decremented.  The first list,
7447      * <a>, may be NULL, in which case a copy of the second list is returned.
7448      * If <complement_b> is TRUE, the union is taken of the complement
7449      * (inversion) of <b> instead of b itself.
7450      *
7451      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7452      * Richard Gillam, published by Addison-Wesley, and explained at some
7453      * length there.  The preface says to incorporate its examples into your
7454      * code at your own risk.
7455      *
7456      * The algorithm is like a merge sort.
7457      *
7458      * XXX A potential performance improvement is to keep track as we go along
7459      * if only one of the inputs contributes to the result, meaning the other
7460      * is a subset of that one.  In that case, we can skip the final copy and
7461      * return the larger of the input lists, but then outside code might need
7462      * to keep track of whether to free the input list or not */
7463
7464     UV* array_a;    /* a's array */
7465     UV* array_b;
7466     UV len_a;       /* length of a's array */
7467     UV len_b;
7468
7469     SV* u;                      /* the resulting union */
7470     UV* array_u;
7471     UV len_u;
7472
7473     UV i_a = 0;             /* current index into a's array */
7474     UV i_b = 0;
7475     UV i_u = 0;
7476
7477     /* running count, as explained in the algorithm source book; items are
7478      * stopped accumulating and are output when the count changes to/from 0.
7479      * The count is incremented when we start a range that's in the set, and
7480      * decremented when we start a range that's not in the set.  So its range
7481      * is 0 to 2.  Only when the count is zero is something not in the set.
7482      */
7483     UV count = 0;
7484
7485     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7486     assert(a != b);
7487
7488     /* If either one is empty, the union is the other one */
7489     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7490         if (*output == a) {
7491             if (a != NULL) {
7492                 SvREFCNT_dec(a);
7493             }
7494         }
7495         if (*output != b) {
7496             *output = invlist_clone(b);
7497             if (complement_b) {
7498                 _invlist_invert(*output);
7499             }
7500         } /* else *output already = b; */
7501         return;
7502     }
7503     else if ((len_b = invlist_len(b)) == 0) {
7504         if (*output == b) {
7505             SvREFCNT_dec(b);
7506         }
7507
7508         /* The complement of an empty list is a list that has everything in it,
7509          * so the union with <a> includes everything too */
7510         if (complement_b) {
7511             if (a == *output) {
7512                 SvREFCNT_dec(a);
7513             }
7514             *output = _new_invlist(1);
7515             _append_range_to_invlist(*output, 0, UV_MAX);
7516         }
7517         else if (*output != a) {
7518             *output = invlist_clone(a);
7519         }
7520         /* else *output already = a; */
7521         return;
7522     }
7523
7524     /* Here both lists exist and are non-empty */
7525     array_a = invlist_array(a);
7526     array_b = invlist_array(b);
7527
7528     /* If are to take the union of 'a' with the complement of b, set it
7529      * up so are looking at b's complement. */
7530     if (complement_b) {
7531
7532         /* To complement, we invert: if the first element is 0, remove it.  To
7533          * do this, we just pretend the array starts one later, and clear the
7534          * flag as we don't have to do anything else later */
7535         if (array_b[0] == 0) {
7536             array_b++;
7537             len_b--;
7538             complement_b = FALSE;
7539         }
7540         else {
7541
7542             /* But if the first element is not zero, we unshift a 0 before the
7543              * array.  The data structure reserves a space for that 0 (which
7544              * should be a '1' right now), so physical shifting is unneeded,
7545              * but temporarily change that element to 0.  Before exiting the
7546              * routine, we must restore the element to '1' */
7547             array_b--;
7548             len_b++;
7549             array_b[0] = 0;
7550         }
7551     }
7552
7553     /* Size the union for the worst case: that the sets are completely
7554      * disjoint */
7555     u = _new_invlist(len_a + len_b);
7556
7557     /* Will contain U+0000 if either component does */
7558     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7559                                       || (len_b > 0 && array_b[0] == 0));
7560
7561     /* Go through each list item by item, stopping when exhausted one of
7562      * them */
7563     while (i_a < len_a && i_b < len_b) {
7564         UV cp;      /* The element to potentially add to the union's array */
7565         bool cp_in_set;   /* is it in the the input list's set or not */
7566
7567         /* We need to take one or the other of the two inputs for the union.
7568          * Since we are merging two sorted lists, we take the smaller of the
7569          * next items.  In case of a tie, we take the one that is in its set
7570          * first.  If we took one not in the set first, it would decrement the
7571          * count, possibly to 0 which would cause it to be output as ending the
7572          * range, and the next time through we would take the same number, and
7573          * output it again as beginning the next range.  By doing it the
7574          * opposite way, there is no possibility that the count will be
7575          * momentarily decremented to 0, and thus the two adjoining ranges will
7576          * be seamlessly merged.  (In a tie and both are in the set or both not
7577          * in the set, it doesn't matter which we take first.) */
7578         if (array_a[i_a] < array_b[i_b]
7579             || (array_a[i_a] == array_b[i_b]
7580                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7581         {
7582             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7583             cp= array_a[i_a++];
7584         }
7585         else {
7586             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7587             cp= array_b[i_b++];
7588         }
7589
7590         /* Here, have chosen which of the two inputs to look at.  Only output
7591          * if the running count changes to/from 0, which marks the
7592          * beginning/end of a range in that's in the set */
7593         if (cp_in_set) {
7594             if (count == 0) {
7595                 array_u[i_u++] = cp;
7596             }
7597             count++;
7598         }
7599         else {
7600             count--;
7601             if (count == 0) {
7602                 array_u[i_u++] = cp;
7603             }
7604         }
7605     }
7606
7607     /* Here, we are finished going through at least one of the lists, which
7608      * means there is something remaining in at most one.  We check if the list
7609      * that hasn't been exhausted is positioned such that we are in the middle
7610      * of a range in its set or not.  (i_a and i_b point to the element beyond
7611      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7612      * is potentially more to output.
7613      * There are four cases:
7614      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7615      *     in the union is entirely from the non-exhausted set.
7616      *  2) Both were in their sets, count is 2.  Nothing further should
7617      *     be output, as everything that remains will be in the exhausted
7618      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7619      *     that
7620      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7621      *     Nothing further should be output because the union includes
7622      *     everything from the exhausted set.  Not decrementing ensures that.
7623      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7624      *     decrementing to 0 insures that we look at the remainder of the
7625      *     non-exhausted set */
7626     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7627         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7628     {
7629         count--;
7630     }
7631
7632     /* The final length is what we've output so far, plus what else is about to
7633      * be output.  (If 'count' is non-zero, then the input list we exhausted
7634      * has everything remaining up to the machine's limit in its set, and hence
7635      * in the union, so there will be no further output. */
7636     len_u = i_u;
7637     if (count == 0) {
7638         /* At most one of the subexpressions will be non-zero */
7639         len_u += (len_a - i_a) + (len_b - i_b);
7640     }
7641
7642     /* Set result to final length, which can change the pointer to array_u, so
7643      * re-find it */
7644     if (len_u != invlist_len(u)) {
7645         invlist_set_len(u, len_u);
7646         invlist_trim(u);
7647         array_u = invlist_array(u);
7648     }
7649
7650     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7651      * the other) ended with everything above it not in its set.  That means
7652      * that the remaining part of the union is precisely the same as the
7653      * non-exhausted list, so can just copy it unchanged.  (If both list were
7654      * exhausted at the same time, then the operations below will be both 0.)
7655      */
7656     if (count == 0) {
7657         IV copy_count; /* At most one will have a non-zero copy count */
7658         if ((copy_count = len_a - i_a) > 0) {
7659             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7660         }
7661         else if ((copy_count = len_b - i_b) > 0) {
7662             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7663         }
7664     }
7665
7666     /*  We may be removing a reference to one of the inputs */
7667     if (a == *output || b == *output) {
7668         SvREFCNT_dec(*output);
7669     }
7670
7671     /* If we've changed b, restore it */
7672     if (complement_b) {
7673         array_b[0] = 1;
7674     }
7675
7676     *output = u;
7677     return;
7678 }
7679
7680 void
7681 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7682 {
7683     /* Take the intersection of two inversion lists and point <i> to it.  *i
7684      * should be defined upon input, and if it points to one of the two lists,
7685      * the reference count to that list will be decremented.
7686      * If <complement_b> is TRUE, the result will be the intersection of <a>
7687      * and the complement (or inversion) of <b> instead of <b> directly.
7688      *
7689      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7690      * Richard Gillam, published by Addison-Wesley, and explained at some
7691      * length there.  The preface says to incorporate its examples into your
7692      * code at your own risk.  In fact, it had bugs
7693      *
7694      * The algorithm is like a merge sort, and is essentially the same as the
7695      * union above
7696      */
7697
7698     UV* array_a;                /* a's array */
7699     UV* array_b;
7700     UV len_a;   /* length of a's array */
7701     UV len_b;
7702
7703     SV* r;                   /* the resulting intersection */
7704     UV* array_r;
7705     UV len_r;
7706
7707     UV i_a = 0;             /* current index into a's array */
7708     UV i_b = 0;
7709     UV i_r = 0;
7710
7711     /* running count, as explained in the algorithm source book; items are
7712      * stopped accumulating and are output when the count changes to/from 2.
7713      * The count is incremented when we start a range that's in the set, and
7714      * decremented when we start a range that's not in the set.  So its range
7715      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7716      */
7717     UV count = 0;
7718
7719     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7720     assert(a != b);
7721
7722     /* Special case if either one is empty */
7723     len_a = invlist_len(a);
7724     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7725
7726         if (len_a != 0 && complement_b) {
7727
7728             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7729              * be empty.  Here, also we are using 'b's complement, which hence
7730              * must be every possible code point.  Thus the intersection is
7731              * simply 'a'. */
7732             if (*i != a) {
7733                 *i = invlist_clone(a);
7734
7735                 if (*i == b) {
7736                     SvREFCNT_dec(b);
7737                 }
7738             }
7739             /* else *i is already 'a' */
7740             return;
7741         }
7742
7743         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7744          * intersection must be empty */
7745         if (*i == a) {
7746             SvREFCNT_dec(a);
7747         }
7748         else if (*i == b) {
7749             SvREFCNT_dec(b);
7750         }
7751         *i = _new_invlist(0);
7752         return;
7753     }
7754
7755     /* Here both lists exist and are non-empty */
7756     array_a = invlist_array(a);
7757     array_b = invlist_array(b);
7758
7759     /* If are to take the intersection of 'a' with the complement of b, set it
7760      * up so are looking at b's complement. */
7761     if (complement_b) {
7762
7763         /* To complement, we invert: if the first element is 0, remove it.  To
7764          * do this, we just pretend the array starts one later, and clear the
7765          * flag as we don't have to do anything else later */
7766         if (array_b[0] == 0) {
7767             array_b++;
7768             len_b--;
7769             complement_b = FALSE;
7770         }
7771         else {
7772
7773             /* But if the first element is not zero, we unshift a 0 before the
7774              * array.  The data structure reserves a space for that 0 (which
7775              * should be a '1' right now), so physical shifting is unneeded,
7776              * but temporarily change that element to 0.  Before exiting the
7777              * routine, we must restore the element to '1' */
7778             array_b--;
7779             len_b++;
7780             array_b[0] = 0;
7781         }
7782     }
7783
7784     /* Size the intersection for the worst case: that the intersection ends up
7785      * fragmenting everything to be completely disjoint */
7786     r= _new_invlist(len_a + len_b);
7787
7788     /* Will contain U+0000 iff both components do */
7789     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7790                                      && len_b > 0 && array_b[0] == 0);
7791
7792     /* Go through each list item by item, stopping when exhausted one of
7793      * them */
7794     while (i_a < len_a && i_b < len_b) {
7795         UV cp;      /* The element to potentially add to the intersection's
7796                        array */
7797         bool cp_in_set; /* Is it in the input list's set or not */
7798
7799         /* We need to take one or the other of the two inputs for the
7800          * intersection.  Since we are merging two sorted lists, we take the
7801          * smaller of the next items.  In case of a tie, we take the one that
7802          * is not in its set first (a difference from the union algorithm).  If
7803          * we took one in the set first, it would increment the count, possibly
7804          * to 2 which would cause it to be output as starting a range in the
7805          * intersection, and the next time through we would take that same
7806          * number, and output it again as ending the set.  By doing it the
7807          * opposite of this, there is no possibility that the count will be
7808          * momentarily incremented to 2.  (In a tie and both are in the set or
7809          * both not in the set, it doesn't matter which we take first.) */
7810         if (array_a[i_a] < array_b[i_b]
7811             || (array_a[i_a] == array_b[i_b]
7812                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7813         {
7814             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7815             cp= array_a[i_a++];
7816         }
7817         else {
7818             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7819             cp= array_b[i_b++];
7820         }
7821
7822         /* Here, have chosen which of the two inputs to look at.  Only output
7823          * if the running count changes to/from 2, which marks the
7824          * beginning/end of a range that's in the intersection */
7825         if (cp_in_set) {
7826             count++;
7827             if (count == 2) {
7828                 array_r[i_r++] = cp;
7829             }
7830         }
7831         else {
7832             if (count == 2) {
7833                 array_r[i_r++] = cp;
7834             }
7835             count--;
7836         }
7837     }
7838
7839     /* Here, we are finished going through at least one of the lists, which
7840      * means there is something remaining in at most one.  We check if the list
7841      * that has been exhausted is positioned such that we are in the middle
7842      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7843      * the ones we care about.)  There are four cases:
7844      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7845      *     nothing left in the intersection.
7846      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7847      *     above 2.  What should be output is exactly that which is in the
7848      *     non-exhausted set, as everything it has is also in the intersection
7849      *     set, and everything it doesn't have can't be in the intersection
7850      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7851      *     gets incremented to 2.  Like the previous case, the intersection is
7852      *     everything that remains in the non-exhausted set.
7853      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7854      *     remains 1.  And the intersection has nothing more. */
7855     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7856         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7857     {
7858         count++;
7859     }
7860
7861     /* The final length is what we've output so far plus what else is in the
7862      * intersection.  At most one of the subexpressions below will be non-zero */
7863     len_r = i_r;
7864     if (count >= 2) {
7865         len_r += (len_a - i_a) + (len_b - i_b);
7866     }
7867
7868     /* Set result to final length, which can change the pointer to array_r, so
7869      * re-find it */
7870     if (len_r != invlist_len(r)) {
7871         invlist_set_len(r, len_r);
7872         invlist_trim(r);
7873         array_r = invlist_array(r);
7874     }
7875
7876     /* Finish outputting any remaining */
7877     if (count >= 2) { /* At most one will have a non-zero copy count */
7878         IV copy_count;
7879         if ((copy_count = len_a - i_a) > 0) {
7880             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7881         }
7882         else if ((copy_count = len_b - i_b) > 0) {
7883             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7884         }
7885     }
7886
7887     /*  We may be removing a reference to one of the inputs */
7888     if (a == *i || b == *i) {
7889         SvREFCNT_dec(*i);
7890     }
7891
7892     /* If we've changed b, restore it */
7893     if (complement_b) {
7894         array_b[0] = 1;
7895     }
7896
7897     *i = r;
7898     return;
7899 }
7900
7901 SV*
7902 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7903 {
7904     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7905      * set.  A pointer to the inversion list is returned.  This may actually be
7906      * a new list, in which case the passed in one has been destroyed.  The
7907      * passed in inversion list can be NULL, in which case a new one is created
7908      * with just the one range in it */
7909
7910     SV* range_invlist;
7911     UV len;
7912
7913     if (invlist == NULL) {
7914         invlist = _new_invlist(2);
7915         len = 0;
7916     }
7917     else {
7918         len = invlist_len(invlist);
7919     }
7920
7921     /* If comes after the final entry, can just append it to the end */
7922     if (len == 0
7923         || start >= invlist_array(invlist)
7924                                     [invlist_len(invlist) - 1])
7925     {
7926         _append_range_to_invlist(invlist, start, end);
7927         return invlist;
7928     }
7929
7930     /* Here, can't just append things, create and return a new inversion list
7931      * which is the union of this range and the existing inversion list */
7932     range_invlist = _new_invlist(2);
7933     _append_range_to_invlist(range_invlist, start, end);
7934
7935     _invlist_union(invlist, range_invlist, &invlist);
7936
7937     /* The temporary can be freed */
7938     SvREFCNT_dec(range_invlist);
7939
7940     return invlist;
7941 }
7942
7943 #endif
7944
7945 PERL_STATIC_INLINE bool
7946 S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
7947 {
7948     /* Does <invlist> contain code point <cp> as part of the set? */
7949
7950     IV index = _invlist_search(invlist, cp);
7951
7952     PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
7953
7954     return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
7955 }
7956
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959     return _add_range_to_invlist(invlist, cp, cp);
7960 }
7961
7962 #ifndef PERL_IN_XSUB_RE
7963 void
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7965 {
7966     /* Complement the input inversion list.  This adds a 0 if the list didn't
7967      * have a zero; removes it otherwise.  As described above, the data
7968      * structure is set up so that this is very efficient */
7969
7970     UV* len_pos = get_invlist_len_addr(invlist);
7971
7972     PERL_ARGS_ASSERT__INVLIST_INVERT;
7973
7974     /* The inverse of matching nothing is matching everything */
7975     if (*len_pos == 0) {
7976         _append_range_to_invlist(invlist, 0, UV_MAX);
7977         return;
7978     }
7979
7980     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7981      * zero element was a 0, so it is being removed, so the length decrements
7982      * by 1; and vice-versa.  SvCUR is unaffected */
7983     if (*get_invlist_zero_addr(invlist) ^= 1) {
7984         (*len_pos)--;
7985     }
7986     else {
7987         (*len_pos)++;
7988     }
7989 }
7990
7991 void
7992 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7993 {
7994     /* Complement the input inversion list (which must be a Unicode property,
7995      * all of which don't match above the Unicode maximum code point.)  And
7996      * Perl has chosen to not have the inversion match above that either.  This
7997      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7998      */
7999
8000     UV len;
8001     UV* array;
8002
8003     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8004
8005     _invlist_invert(invlist);
8006
8007     len = invlist_len(invlist);
8008
8009     if (len != 0) { /* If empty do nothing */
8010         array = invlist_array(invlist);
8011         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8012             /* Add 0x110000.  First, grow if necessary */
8013             len++;
8014             if (invlist_max(invlist) < len) {
8015                 invlist_extend(invlist, len);
8016                 array = invlist_array(invlist);
8017             }
8018             invlist_set_len(invlist, len);
8019             array[len - 1] = PERL_UNICODE_MAX + 1;
8020         }
8021         else {  /* Remove the 0x110000 */
8022             invlist_set_len(invlist, len - 1);
8023         }
8024     }
8025
8026     return;
8027 }
8028 #endif
8029
8030 PERL_STATIC_INLINE SV*
8031 S_invlist_clone(pTHX_ SV* const invlist)
8032 {
8033
8034     /* Return a new inversion list that is a copy of the input one, which is
8035      * unchanged */
8036
8037     /* Need to allocate extra space to accommodate Perl's addition of a
8038      * trailing NUL to SvPV's, since it thinks they are always strings */
8039     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8040     STRLEN length = SvCUR(invlist);
8041
8042     PERL_ARGS_ASSERT_INVLIST_CLONE;
8043
8044     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8045     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8046
8047     return new_invlist;
8048 }
8049
8050 PERL_STATIC_INLINE UV*
8051 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8052 {
8053     /* Return the address of the UV that contains the current iteration
8054      * position */
8055
8056     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8057
8058     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8059 }
8060
8061 PERL_STATIC_INLINE UV*
8062 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8063 {
8064     /* Return the address of the UV that contains the version id. */
8065
8066     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8067
8068     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8069 }
8070
8071 PERL_STATIC_INLINE void
8072 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8073 {
8074     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8075
8076     *get_invlist_iter_addr(invlist) = 0;
8077 }
8078
8079 STATIC bool
8080 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8081 {
8082     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8083      * This call sets in <*start> and <*end>, the next range in <invlist>.
8084      * Returns <TRUE> if successful and the next call will return the next
8085      * range; <FALSE> if was already at the end of the list.  If the latter,
8086      * <*start> and <*end> are unchanged, and the next call to this function
8087      * will start over at the beginning of the list */
8088
8089     UV* pos = get_invlist_iter_addr(invlist);
8090     UV len = invlist_len(invlist);
8091     UV *array;
8092
8093     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8094
8095     if (*pos >= len) {
8096         *pos = UV_MAX;  /* Force iternit() to be required next time */
8097         return FALSE;
8098     }
8099
8100     array = invlist_array(invlist);
8101
8102     *start = array[(*pos)++];
8103
8104     if (*pos >= len) {
8105         *end = UV_MAX;
8106     }
8107     else {
8108         *end = array[(*pos)++] - 1;
8109     }
8110
8111     return TRUE;
8112 }
8113
8114 PERL_STATIC_INLINE UV
8115 S_invlist_highest(pTHX_ SV* const invlist)
8116 {
8117     /* Returns the highest code point that matches an inversion list.  This API
8118      * has an ambiguity, as it returns 0 under either the highest is actually
8119      * 0, or if the list is empty.  If this distinction matters to you, check
8120      * for emptiness before calling this function */
8121
8122     UV len = invlist_len(invlist);
8123     UV *array;
8124
8125     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8126
8127     if (len == 0) {
8128         return 0;
8129     }
8130
8131     array = invlist_array(invlist);
8132
8133     /* The last element in the array in the inversion list always starts a
8134      * range that goes to infinity.  That range may be for code points that are
8135      * matched in the inversion list, or it may be for ones that aren't
8136      * matched.  In the latter case, the highest code point in the set is one
8137      * less than the beginning of this range; otherwise it is the final element
8138      * of this range: infinity */
8139     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8140            ? UV_MAX
8141            : array[len - 1] - 1;
8142 }
8143
8144 #ifndef PERL_IN_XSUB_RE
8145 SV *
8146 Perl__invlist_contents(pTHX_ SV* const invlist)
8147 {
8148     /* Get the contents of an inversion list into a string SV so that they can
8149      * be printed out.  It uses the format traditionally done for debug tracing
8150      */
8151
8152     UV start, end;
8153     SV* output = newSVpvs("\n");
8154
8155     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8156
8157     invlist_iterinit(invlist);
8158     while (invlist_iternext(invlist, &start, &end)) {
8159         if (end == UV_MAX) {
8160             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8161         }
8162         else if (end != start) {
8163             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8164                     start,       end);
8165         }
8166         else {
8167             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8168         }
8169     }
8170
8171     return output;
8172 }
8173 #endif
8174
8175 #if 0
8176 void
8177 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8178 {
8179     /* Dumps out the ranges in an inversion list.  The string 'header'
8180      * if present is output on a line before the first range */
8181
8182     UV start, end;
8183
8184     if (header && strlen(header)) {
8185         PerlIO_printf(Perl_debug_log, "%s\n", header);
8186     }
8187     invlist_iterinit(invlist);
8188     while (invlist_iternext(invlist, &start, &end)) {
8189         if (end == UV_MAX) {
8190             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8191         }
8192         else {
8193             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8194         }
8195     }
8196 }
8197 #endif
8198
8199 #if 0
8200 bool
8201 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8202 {
8203     /* Return a boolean as to if the two passed in inversion lists are
8204      * identical.  The final argument, if TRUE, says to take the complement of
8205      * the second inversion list before doing the comparison */
8206
8207     UV* array_a = invlist_array(a);
8208     UV* array_b = invlist_array(b);
8209     UV len_a = invlist_len(a);
8210     UV len_b = invlist_len(b);
8211
8212     UV i = 0;               /* current index into the arrays */
8213     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8214
8215     PERL_ARGS_ASSERT__INVLISTEQ;
8216
8217     /* If are to compare 'a' with the complement of b, set it
8218      * up so are looking at b's complement. */
8219     if (complement_b) {
8220
8221         /* The complement of nothing is everything, so <a> would have to have
8222          * just one element, starting at zero (ending at infinity) */
8223         if (len_b == 0) {
8224             return (len_a == 1 && array_a[0] == 0);
8225         }
8226         else if (array_b[0] == 0) {
8227
8228             /* Otherwise, to complement, we invert.  Here, the first element is
8229              * 0, just remove it.  To do this, we just pretend the array starts
8230              * one later, and clear the flag as we don't have to do anything
8231              * else later */
8232
8233             array_b++;
8234             len_b--;
8235             complement_b = FALSE;
8236         }
8237         else {
8238
8239             /* But if the first element is not zero, we unshift a 0 before the
8240              * array.  The data structure reserves a space for that 0 (which
8241              * should be a '1' right now), so physical shifting is unneeded,
8242              * but temporarily change that element to 0.  Before exiting the
8243              * routine, we must restore the element to '1' */
8244             array_b--;
8245             len_b++;
8246             array_b[0] = 0;
8247         }
8248     }
8249
8250     /* Make sure that the lengths are the same, as well as the final element
8251      * before looping through the remainder.  (Thus we test the length, final,
8252      * and first elements right off the bat) */
8253     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8254         retval = FALSE;
8255     }
8256     else for (i = 0; i < len_a - 1; i++) {
8257         if (array_a[i] != array_b[i]) {
8258             retval = FALSE;
8259             break;
8260         }
8261     }
8262
8263     if (complement_b) {
8264         array_b[0] = 1;
8265     }
8266     return retval;
8267 }
8268 #endif
8269
8270 #undef HEADER_LENGTH
8271 #undef INVLIST_INITIAL_LENGTH
8272 #undef TO_INTERNAL_SIZE
8273 #undef FROM_INTERNAL_SIZE
8274 #undef INVLIST_LEN_OFFSET
8275 #undef INVLIST_ZERO_OFFSET
8276 #undef INVLIST_ITER_OFFSET
8277 #undef INVLIST_VERSION_ID
8278
8279 /* End of inversion list object */
8280
8281 /*
8282  - reg - regular expression, i.e. main body or parenthesized thing
8283  *
8284  * Caller must absorb opening parenthesis.
8285  *
8286  * Combining parenthesis handling with the base level of regular expression
8287  * is a trifle forced, but the need to tie the tails of the branches to what
8288  * follows makes it hard to avoid.
8289  */
8290 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8291 #ifdef DEBUGGING
8292 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8293 #else
8294 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8295 #endif
8296
8297 STATIC regnode *
8298 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8299     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8300 {
8301     dVAR;
8302     regnode *ret;               /* Will be the head of the group. */
8303     regnode *br;
8304     regnode *lastbr;
8305     regnode *ender = NULL;
8306     I32 parno = 0;
8307     I32 flags;
8308     U32 oregflags = RExC_flags;
8309     bool have_branch = 0;
8310     bool is_open = 0;
8311     I32 freeze_paren = 0;
8312     I32 after_freeze = 0;
8313
8314     /* for (?g), (?gc), and (?o) warnings; warning
8315        about (?c) will warn about (?g) -- japhy    */
8316
8317 #define WASTED_O  0x01
8318 #define WASTED_G  0x02
8319 #define WASTED_C  0x04
8320 #define WASTED_GC (0x02|0x04)
8321     I32 wastedflags = 0x00;
8322
8323     char * parse_start = RExC_parse; /* MJD */
8324     char * const oregcomp_parse = RExC_parse;
8325
8326     GET_RE_DEBUG_FLAGS_DECL;
8327
8328     PERL_ARGS_ASSERT_REG;
8329     DEBUG_PARSE("reg ");
8330
8331     *flagp = 0;                         /* Tentatively. */
8332
8333
8334     /* Make an OPEN node, if parenthesized. */
8335     if (paren) {
8336         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8337             char *start_verb = RExC_parse;
8338             STRLEN verb_len = 0;
8339             char *start_arg = NULL;
8340             unsigned char op = 0;
8341             int argok = 1;
8342             int internal_argval = 0; /* internal_argval is only useful if !argok */
8343             while ( *RExC_parse && *RExC_parse != ')' ) {
8344                 if ( *RExC_parse == ':' ) {
8345                     start_arg = RExC_parse + 1;
8346                     break;
8347                 }
8348                 RExC_parse++;
8349             }
8350             ++start_verb;
8351             verb_len = RExC_parse - start_verb;
8352             if ( start_arg ) {
8353                 RExC_parse++;
8354                 while ( *RExC_parse && *RExC_parse != ')' )
8355                     RExC_parse++;
8356                 if ( *RExC_parse != ')' )
8357                     vFAIL("Unterminated verb pattern argument");
8358                 if ( RExC_parse == start_arg )
8359                     start_arg = NULL;
8360             } else {
8361                 if ( *RExC_parse != ')' )
8362                     vFAIL("Unterminated verb pattern");
8363             }
8364
8365             switch ( *start_verb ) {
8366             case 'A':  /* (*ACCEPT) */
8367                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8368                     op = ACCEPT;
8369                     internal_argval = RExC_nestroot;
8370                 }
8371                 break;
8372             case 'C':  /* (*COMMIT) */
8373                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8374                     op = COMMIT;
8375                 break;
8376             case 'F':  /* (*FAIL) */
8377                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8378                     op = OPFAIL;
8379                     argok = 0;
8380                 }
8381                 break;
8382             case ':':  /* (*:NAME) */
8383             case 'M':  /* (*MARK:NAME) */
8384                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8385                     op = MARKPOINT;
8386                     argok = -1;
8387                 }
8388                 break;
8389             case 'P':  /* (*PRUNE) */
8390                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8391                     op = PRUNE;
8392                 break;
8393             case 'S':   /* (*SKIP) */
8394                 if ( memEQs(start_verb,verb_len,"SKIP") )
8395                     op = SKIP;
8396                 break;
8397             case 'T':  /* (*THEN) */
8398                 /* [19:06] <TimToady> :: is then */
8399                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8400                     op = CUTGROUP;
8401                     RExC_seen |= REG_SEEN_CUTGROUP;
8402                 }
8403                 break;
8404             }
8405             if ( ! op ) {
8406                 RExC_parse++;
8407                 vFAIL3("Unknown verb pattern '%.*s'",
8408                     verb_len, start_verb);
8409             }
8410             if ( argok ) {
8411                 if ( start_arg && internal_argval ) {
8412                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8413                         verb_len, start_verb);
8414                 } else if ( argok < 0 && !start_arg ) {
8415                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8416                         verb_len, start_verb);
8417                 } else {
8418                     ret = reganode(pRExC_state, op, internal_argval);
8419                     if ( ! internal_argval && ! SIZE_ONLY ) {
8420                         if (start_arg) {
8421                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8422                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8423                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8424                             ret->flags = 0;
8425                         } else {
8426                             ret->flags = 1;
8427                         }
8428                     }
8429                 }
8430                 if (!internal_argval)
8431                     RExC_seen |= REG_SEEN_VERBARG;
8432             } else if ( start_arg ) {
8433                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8434                         verb_len, start_verb);
8435             } else {
8436                 ret = reg_node(pRExC_state, op);
8437             }
8438             nextchar(pRExC_state);
8439             return ret;
8440         } else
8441         if (*RExC_parse == '?') { /* (?...) */
8442             bool is_logical = 0;
8443             const char * const seqstart = RExC_parse;
8444             bool has_use_defaults = FALSE;
8445
8446             RExC_parse++;
8447             paren = *RExC_parse++;
8448             ret = NULL;                 /* For look-ahead/behind. */
8449             switch (paren) {
8450
8451             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8452                 paren = *RExC_parse++;
8453                 if ( paren == '<')         /* (?P<...>) named capture */
8454                     goto named_capture;
8455                 else if (paren == '>') {   /* (?P>name) named recursion */
8456                     goto named_recursion;
8457                 }
8458                 else if (paren == '=') {   /* (?P=...)  named backref */
8459                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8460                        you change this make sure you change that */
8461                     char* name_start = RExC_parse;
8462                     U32 num = 0;
8463                     SV *sv_dat = reg_scan_name(pRExC_state,
8464                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8465                     if (RExC_parse == name_start || *RExC_parse != ')')
8466                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8467
8468                     if (!SIZE_ONLY) {
8469                         num = add_data( pRExC_state, 1, "S" );
8470                         RExC_rxi->data->data[num]=(void*)sv_dat;
8471                         SvREFCNT_inc_simple_void(sv_dat);
8472                     }
8473                     RExC_sawback = 1;
8474                     ret = reganode(pRExC_state,
8475                                    ((! FOLD)
8476                                      ? NREF
8477                                      : (ASCII_FOLD_RESTRICTED)
8478                                        ? NREFFA
8479                                        : (AT_LEAST_UNI_SEMANTICS)
8480                                          ? NREFFU
8481                                          : (LOC)
8482                                            ? NREFFL
8483                                            : NREFF),
8484                                     num);
8485                     *flagp |= HASWIDTH;
8486
8487                     Set_Node_Offset(ret, parse_start+1);
8488                     Set_Node_Cur_Length(ret); /* MJD */
8489
8490                     nextchar(pRExC_state);
8491                     return ret;
8492                 }
8493                 RExC_parse++;
8494                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8495                 /*NOTREACHED*/
8496             case '<':           /* (?<...) */
8497                 if (*RExC_parse == '!')
8498                     paren = ',';
8499                 else if (*RExC_parse != '=')
8500               named_capture:
8501                 {               /* (?<...>) */
8502                     char *name_start;
8503                     SV *svname;
8504                     paren= '>';
8505             case '\'':          /* (?'...') */
8506                     name_start= RExC_parse;
8507                     svname = reg_scan_name(pRExC_state,
8508                         SIZE_ONLY ?  /* reverse test from the others */
8509                         REG_RSN_RETURN_NAME :
8510                         REG_RSN_RETURN_NULL);
8511                     if (RExC_parse == name_start) {
8512                         RExC_parse++;
8513                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8514                         /*NOTREACHED*/
8515                     }
8516                     if (*RExC_parse != paren)
8517                         vFAIL2("Sequence (?%c... not terminated",
8518                             paren=='>' ? '<' : paren);
8519                     if (SIZE_ONLY) {
8520                         HE *he_str;
8521                         SV *sv_dat = NULL;
8522                         if (!svname) /* shouldn't happen */
8523                             Perl_croak(aTHX_
8524                                 "panic: reg_scan_name returned NULL");
8525                         if (!RExC_paren_names) {
8526                             RExC_paren_names= newHV();
8527                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8528 #ifdef DEBUGGING
8529                             RExC_paren_name_list= newAV();
8530                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8531 #endif
8532                         }
8533                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8534                         if ( he_str )
8535                             sv_dat = HeVAL(he_str);
8536                         if ( ! sv_dat ) {
8537                             /* croak baby croak */
8538                             Perl_croak(aTHX_
8539                                 "panic: paren_name hash element allocation failed");
8540                         } else if ( SvPOK(sv_dat) ) {
8541                             /* (?|...) can mean we have dupes so scan to check
8542                                its already been stored. Maybe a flag indicating
8543                                we are inside such a construct would be useful,
8544                                but the arrays are likely to be quite small, so
8545                                for now we punt -- dmq */
8546                             IV count = SvIV(sv_dat);
8547                             I32 *pv = (I32*)SvPVX(sv_dat);
8548                             IV i;
8549                             for ( i = 0 ; i < count ; i++ ) {
8550                                 if ( pv[i] == RExC_npar ) {
8551                                     count = 0;
8552                                     break;
8553                                 }
8554                             }
8555                             if ( count ) {
8556                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8557                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8558                                 pv[count] = RExC_npar;
8559                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8560                             }
8561                         } else {
8562                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8563                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8564                             SvIOK_on(sv_dat);
8565                             SvIV_set(sv_dat, 1);
8566                         }
8567 #ifdef DEBUGGING
8568                         /* Yes this does cause a memory leak in debugging Perls */
8569                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8570                             SvREFCNT_dec(svname);
8571 #endif
8572
8573                         /*sv_dump(sv_dat);*/
8574                     }
8575                     nextchar(pRExC_state);
8576                     paren = 1;
8577                     goto capturing_parens;
8578                 }
8579                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8580                 RExC_in_lookbehind++;
8581                 RExC_parse++;
8582             case '=':           /* (?=...) */
8583                 RExC_seen_zerolen++;
8584                 break;
8585             case '!':           /* (?!...) */
8586                 RExC_seen_zerolen++;
8587                 if (*RExC_parse == ')') {
8588                     ret=reg_node(pRExC_state, OPFAIL);
8589                     nextchar(pRExC_state);
8590                     return ret;
8591                 }
8592                 break;
8593             case '|':           /* (?|...) */
8594                 /* branch reset, behave like a (?:...) except that
8595                    buffers in alternations share the same numbers */
8596                 paren = ':';
8597                 after_freeze = freeze_paren = RExC_npar;
8598                 break;
8599             case ':':           /* (?:...) */
8600             case '>':           /* (?>...) */
8601                 break;
8602             case '$':           /* (?$...) */
8603             case '@':           /* (?@...) */
8604                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8605                 break;
8606             case '#':           /* (?#...) */
8607                 while (*RExC_parse && *RExC_parse != ')')
8608                     RExC_parse++;
8609                 if (*RExC_parse != ')')
8610                     FAIL("Sequence (?#... not terminated");
8611                 nextchar(pRExC_state);
8612                 *flagp = TRYAGAIN;
8613                 return NULL;
8614             case '0' :           /* (?0) */
8615             case 'R' :           /* (?R) */
8616                 if (*RExC_parse != ')')
8617                     FAIL("Sequence (?R) not terminated");
8618                 ret = reg_node(pRExC_state, GOSTART);
8619                 *flagp |= POSTPONED;
8620                 nextchar(pRExC_state);
8621                 return ret;
8622                 /*notreached*/
8623             { /* named and numeric backreferences */
8624                 I32 num;
8625             case '&':            /* (?&NAME) */
8626                 parse_start = RExC_parse - 1;
8627               named_recursion:
8628                 {
8629                     SV *sv_dat = reg_scan_name(pRExC_state,
8630                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8631                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8632                 }
8633                 goto gen_recurse_regop;
8634                 assert(0); /* NOT REACHED */
8635             case '+':
8636                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8637                     RExC_parse++;
8638                     vFAIL("Illegal pattern");
8639                 }
8640                 goto parse_recursion;
8641                 /* NOT REACHED*/
8642             case '-': /* (?-1) */
8643                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8644                     RExC_parse--; /* rewind to let it be handled later */
8645                     goto parse_flags;
8646                 }
8647                 /*FALLTHROUGH */
8648             case '1': case '2': case '3': case '4': /* (?1) */
8649             case '5': case '6': case '7': case '8': case '9':
8650                 RExC_parse--;
8651               parse_recursion:
8652                 num = atoi(RExC_parse);
8653                 parse_start = RExC_parse - 1; /* MJD */
8654                 if (*RExC_parse == '-')
8655                     RExC_parse++;
8656                 while (isDIGIT(*RExC_parse))
8657                         RExC_parse++;
8658                 if (*RExC_parse!=')')
8659                     vFAIL("Expecting close bracket");
8660
8661               gen_recurse_regop:
8662                 if ( paren == '-' ) {
8663                     /*
8664                     Diagram of capture buffer numbering.
8665                     Top line is the normal capture buffer numbers
8666                     Bottom line is the negative indexing as from
8667                     the X (the (?-2))
8668
8669                     +   1 2    3 4 5 X          6 7
8670                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8671                     -   5 4    3 2 1 X          x x
8672
8673                     */
8674                     num = RExC_npar + num;
8675                     if (num < 1)  {
8676                         RExC_parse++;
8677                         vFAIL("Reference to nonexistent group");
8678                     }
8679                 } else if ( paren == '+' ) {
8680                     num = RExC_npar + num - 1;
8681                 }
8682
8683                 ret = reganode(pRExC_state, GOSUB, num);
8684                 if (!SIZE_ONLY) {
8685                     if (num > (I32)RExC_rx->nparens) {
8686                         RExC_parse++;
8687                         vFAIL("Reference to nonexistent group");
8688                     }
8689                     ARG2L_SET( ret, RExC_recurse_count++);
8690                     RExC_emit++;
8691                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8692                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8693                 } else {
8694                     RExC_size++;
8695                 }
8696                 RExC_seen |= REG_SEEN_RECURSE;
8697                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8698                 Set_Node_Offset(ret, parse_start); /* MJD */
8699
8700                 *flagp |= POSTPONED;
8701                 nextchar(pRExC_state);
8702                 return ret;
8703             } /* named and numeric backreferences */
8704             assert(0); /* NOT REACHED */
8705
8706             case '?':           /* (??...) */
8707                 is_logical = 1;
8708                 if (*RExC_parse != '{') {
8709                     RExC_parse++;
8710                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8711                     /*NOTREACHED*/
8712                 }
8713                 *flagp |= POSTPONED;
8714                 paren = *RExC_parse++;
8715                 /* FALL THROUGH */
8716             case '{':           /* (?{...}) */
8717             {
8718                 U32 n = 0;
8719                 struct reg_code_block *cb;
8720
8721                 RExC_seen_zerolen++;
8722
8723                 if (   !pRExC_state->num_code_blocks
8724                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8725                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8726                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8727                             - RExC_start)
8728                 ) {
8729                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8730                         FAIL("panic: Sequence (?{...}): no code block found\n");
8731                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8732                 }
8733                 /* this is a pre-compiled code block (?{...}) */
8734                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8735                 RExC_parse = RExC_start + cb->end;
8736                 if (!SIZE_ONLY) {
8737                     OP *o = cb->block;
8738                     if (cb->src_regex) {
8739                         n = add_data(pRExC_state, 2, "rl");
8740                         RExC_rxi->data->data[n] =
8741                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8742                         RExC_rxi->data->data[n+1] = (void*)o;
8743                     }
8744                     else {
8745                         n = add_data(pRExC_state, 1,
8746                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8747                         RExC_rxi->data->data[n] = (void*)o;
8748                     }
8749                 }
8750                 pRExC_state->code_index++;
8751                 nextchar(pRExC_state);
8752
8753                 if (is_logical) {
8754                     regnode *eval;
8755                     ret = reg_node(pRExC_state, LOGICAL);
8756                     eval = reganode(pRExC_state, EVAL, n);
8757                     if (!SIZE_ONLY) {
8758                         ret->flags = 2;
8759                         /* for later propagation into (??{}) return value */
8760                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8761                     }
8762                     REGTAIL(pRExC_state, ret, eval);
8763                     /* deal with the length of this later - MJD */
8764                     return ret;
8765                 }
8766                 ret = reganode(pRExC_state, EVAL, n);
8767                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8768                 Set_Node_Offset(ret, parse_start);
8769                 return ret;
8770             }
8771             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8772             {
8773                 int is_define= 0;
8774                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8775                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8776                         || RExC_parse[1] == '<'
8777                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8778                         I32 flag;
8779
8780                         ret = reg_node(pRExC_state, LOGICAL);
8781                         if (!SIZE_ONLY)
8782                             ret->flags = 1;
8783                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8784                         goto insert_if;
8785                     }
8786                 }
8787                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8788                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8789                 {
8790                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8791                     char *name_start= RExC_parse++;
8792                     U32 num = 0;
8793                     SV *sv_dat=reg_scan_name(pRExC_state,
8794                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8795                     if (RExC_parse == name_start || *RExC_parse != ch)
8796                         vFAIL2("Sequence (?(%c... not terminated",
8797                             (ch == '>' ? '<' : ch));
8798                     RExC_parse++;
8799                     if (!SIZE_ONLY) {
8800                         num = add_data( pRExC_state, 1, "S" );
8801                         RExC_rxi->data->data[num]=(void*)sv_dat;
8802                         SvREFCNT_inc_simple_void(sv_dat);
8803                     }
8804                     ret = reganode(pRExC_state,NGROUPP,num);
8805                     goto insert_if_check_paren;
8806                 }
8807                 else if (RExC_parse[0] == 'D' &&
8808                          RExC_parse[1] == 'E' &&
8809                          RExC_parse[2] == 'F' &&
8810                          RExC_parse[3] == 'I' &&
8811                          RExC_parse[4] == 'N' &&
8812                          RExC_parse[5] == 'E')
8813                 {
8814                     ret = reganode(pRExC_state,DEFINEP,0);
8815                     RExC_parse +=6 ;
8816                     is_define = 1;
8817                     goto insert_if_check_paren;
8818                 }
8819                 else if (RExC_parse[0] == 'R') {
8820                     RExC_parse++;
8821                     parno = 0;
8822                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8823                         parno = atoi(RExC_parse++);
8824                         while (isDIGIT(*RExC_parse))
8825                             RExC_parse++;
8826                     } else if (RExC_parse[0] == '&') {
8827                         SV *sv_dat;
8828                         RExC_parse++;
8829                         sv_dat = reg_scan_name(pRExC_state,
8830                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8831                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8832                     }
8833                     ret = reganode(pRExC_state,INSUBP,parno);
8834                     goto insert_if_check_paren;
8835                 }
8836                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8837                     /* (?(1)...) */
8838                     char c;
8839                     parno = atoi(RExC_parse++);
8840
8841                     while (isDIGIT(*RExC_parse))
8842                         RExC_parse++;
8843                     ret = reganode(pRExC_state, GROUPP, parno);
8844
8845                  insert_if_check_paren:
8846                     if ((c = *nextchar(pRExC_state)) != ')')
8847                         vFAIL("Switch condition not recognized");
8848                   insert_if:
8849                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8850                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8851                     if (br == NULL)
8852                         br = reganode(pRExC_state, LONGJMP, 0);
8853                     else
8854                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8855                     c = *nextchar(pRExC_state);
8856                     if (flags&HASWIDTH)
8857                         *flagp |= HASWIDTH;
8858                     if (c == '|') {
8859                         if (is_define)
8860                             vFAIL("(?(DEFINE)....) does not allow branches");
8861                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8862                         regbranch(pRExC_state, &flags, 1,depth+1);
8863                         REGTAIL(pRExC_state, ret, lastbr);
8864                         if (flags&HASWIDTH)
8865                             *flagp |= HASWIDTH;
8866                         c = *nextchar(pRExC_state);
8867                     }
8868                     else
8869                         lastbr = NULL;
8870                     if (c != ')')
8871                         vFAIL("Switch (?(condition)... contains too many branches");
8872                     ender = reg_node(pRExC_state, TAIL);
8873                     REGTAIL(pRExC_state, br, ender);
8874                     if (lastbr) {
8875                         REGTAIL(pRExC_state, lastbr, ender);
8876                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8877                     }
8878                     else
8879                         REGTAIL(pRExC_state, ret, ender);
8880                     RExC_size++; /* XXX WHY do we need this?!!
8881                                     For large programs it seems to be required
8882                                     but I can't figure out why. -- dmq*/
8883                     return ret;
8884                 }
8885                 else {
8886                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8887                 }
8888             }
8889             case 0:
8890                 RExC_parse--; /* for vFAIL to print correctly */
8891                 vFAIL("Sequence (? incomplete");
8892                 break;
8893             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8894                                        that follow */
8895                 has_use_defaults = TRUE;
8896                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8897                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8898                                                 ? REGEX_UNICODE_CHARSET
8899                                                 : REGEX_DEPENDS_CHARSET);
8900                 goto parse_flags;
8901             default:
8902                 --RExC_parse;
8903                 parse_flags:      /* (?i) */
8904             {
8905                 U32 posflags = 0, negflags = 0;
8906                 U32 *flagsp = &posflags;
8907                 char has_charset_modifier = '\0';
8908                 regex_charset cs = get_regex_charset(RExC_flags);
8909                 if (cs == REGEX_DEPENDS_CHARSET
8910                     && (RExC_utf8 || RExC_uni_semantics))
8911                 {
8912                     cs = REGEX_UNICODE_CHARSET;
8913                 }
8914
8915                 while (*RExC_parse) {
8916                     /* && strchr("iogcmsx", *RExC_parse) */
8917                     /* (?g), (?gc) and (?o) are useless here
8918                        and must be globally applied -- japhy */
8919                     switch (*RExC_parse) {
8920                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8921                     case LOCALE_PAT_MOD:
8922                         if (has_charset_modifier) {
8923                             goto excess_modifier;
8924                         }
8925                         else if (flagsp == &negflags) {
8926                             goto neg_modifier;
8927                         }
8928                         cs = REGEX_LOCALE_CHARSET;
8929                         has_charset_modifier = LOCALE_PAT_MOD;
8930                         RExC_contains_locale = 1;
8931                         break;
8932                     case UNICODE_PAT_MOD:
8933                         if (has_charset_modifier) {
8934                             goto excess_modifier;
8935                         }
8936                         else if (flagsp == &negflags) {
8937                             goto neg_modifier;
8938                         }
8939                         cs = REGEX_UNICODE_CHARSET;
8940                         has_charset_modifier = UNICODE_PAT_MOD;
8941                         break;
8942                     case ASCII_RESTRICT_PAT_MOD:
8943                         if (flagsp == &negflags) {
8944                             goto neg_modifier;
8945                         }
8946                         if (has_charset_modifier) {
8947                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8948                                 goto excess_modifier;
8949                             }
8950                             /* Doubled modifier implies more restricted */
8951                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8952                         }
8953                         else {
8954                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8955                         }
8956                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8957                         break;
8958                     case DEPENDS_PAT_MOD:
8959                         if (has_use_defaults) {
8960                             goto fail_modifiers;
8961                         }
8962                         else if (flagsp == &negflags) {
8963                             goto neg_modifier;
8964                         }
8965                         else if (has_charset_modifier) {
8966                             goto excess_modifier;
8967                         }
8968
8969                         /* The dual charset means unicode semantics if the
8970                          * pattern (or target, not known until runtime) are
8971                          * utf8, or something in the pattern indicates unicode
8972                          * semantics */
8973                         cs = (RExC_utf8 || RExC_uni_semantics)
8974                              ? REGEX_UNICODE_CHARSET
8975                              : REGEX_DEPENDS_CHARSET;
8976                         has_charset_modifier = DEPENDS_PAT_MOD;
8977                         break;
8978                     excess_modifier:
8979                         RExC_parse++;
8980                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8981                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8982                         }
8983                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8984                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8985                         }
8986                         else {
8987                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8988                         }
8989                         /*NOTREACHED*/
8990                     neg_modifier:
8991                         RExC_parse++;
8992                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8993                         /*NOTREACHED*/
8994                     case ONCE_PAT_MOD: /* 'o' */
8995                     case GLOBAL_PAT_MOD: /* 'g' */
8996                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8997                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8998                             if (! (wastedflags & wflagbit) ) {
8999                                 wastedflags |= wflagbit;
9000                                 vWARN5(
9001                                     RExC_parse + 1,
9002                                     "Useless (%s%c) - %suse /%c modifier",
9003                                     flagsp == &negflags ? "?-" : "?",
9004                                     *RExC_parse,
9005                                     flagsp == &negflags ? "don't " : "",
9006                                     *RExC_parse
9007                                 );
9008                             }
9009                         }
9010                         break;
9011
9012                     case CONTINUE_PAT_MOD: /* 'c' */
9013                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9014                             if (! (wastedflags & WASTED_C) ) {
9015                                 wastedflags |= WASTED_GC;
9016                                 vWARN3(
9017                                     RExC_parse + 1,
9018                                     "Useless (%sc) - %suse /gc modifier",
9019                                     flagsp == &negflags ? "?-" : "?",
9020                                     flagsp == &negflags ? "don't " : ""
9021                                 );
9022                             }
9023                         }
9024                         break;
9025                     case KEEPCOPY_PAT_MOD: /* 'p' */
9026                         if (flagsp == &negflags) {
9027                             if (SIZE_ONLY)
9028                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9029                         } else {
9030                             *flagsp |= RXf_PMf_KEEPCOPY;
9031                         }
9032                         break;
9033                     case '-':
9034                         /* A flag is a default iff it is following a minus, so
9035                          * if there is a minus, it means will be trying to
9036                          * re-specify a default which is an error */
9037                         if (has_use_defaults || flagsp == &negflags) {
9038             fail_modifiers:
9039                             RExC_parse++;
9040                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9041                             /*NOTREACHED*/
9042                         }
9043                         flagsp = &negflags;
9044                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9045                         break;
9046                     case ':':
9047                         paren = ':';
9048                         /*FALLTHROUGH*/
9049                     case ')':
9050                         RExC_flags |= posflags;
9051                         RExC_flags &= ~negflags;
9052                         set_regex_charset(&RExC_flags, cs);
9053                         if (paren != ':') {
9054                             oregflags |= posflags;
9055                             oregflags &= ~negflags;
9056                             set_regex_charset(&oregflags, cs);
9057                         }
9058                         nextchar(pRExC_state);
9059                         if (paren != ':') {
9060                             *flagp = TRYAGAIN;
9061                             return NULL;
9062                         } else {
9063                             ret = NULL;
9064                             goto parse_rest;
9065                         }
9066                         /*NOTREACHED*/
9067                     default:
9068                         RExC_parse++;
9069                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9070                         /*NOTREACHED*/
9071                     }
9072                     ++RExC_parse;
9073                 }
9074             }} /* one for the default block, one for the switch */
9075         }
9076         else {                  /* (...) */
9077           capturing_parens:
9078             parno = RExC_npar;
9079             RExC_npar++;
9080
9081             ret = reganode(pRExC_state, OPEN, parno);
9082             if (!SIZE_ONLY ){
9083                 if (!RExC_nestroot)
9084                     RExC_nestroot = parno;
9085                 if (RExC_seen & REG_SEEN_RECURSE
9086                     && !RExC_open_parens[parno-1])
9087                 {
9088                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9089                         "Setting open paren #%"IVdf" to %d\n",
9090                         (IV)parno, REG_NODE_NUM(ret)));
9091                     RExC_open_parens[parno-1]= ret;
9092                 }
9093             }
9094             Set_Node_Length(ret, 1); /* MJD */
9095             Set_Node_Offset(ret, RExC_parse); /* MJD */
9096             is_open = 1;
9097         }
9098     }
9099     else                        /* ! paren */
9100         ret = NULL;
9101
9102    parse_rest:
9103     /* Pick up the branches, linking them together. */
9104     parse_start = RExC_parse;   /* MJD */
9105     br = regbranch(pRExC_state, &flags, 1,depth+1);
9106
9107     /*     branch_len = (paren != 0); */
9108
9109     if (br == NULL)
9110         return(NULL);
9111     if (*RExC_parse == '|') {
9112         if (!SIZE_ONLY && RExC_extralen) {
9113             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9114         }
9115         else {                  /* MJD */
9116             reginsert(pRExC_state, BRANCH, br, depth+1);
9117             Set_Node_Length(br, paren != 0);
9118             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9119         }
9120         have_branch = 1;
9121         if (SIZE_ONLY)
9122             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9123     }
9124     else if (paren == ':') {
9125         *flagp |= flags&SIMPLE;
9126     }
9127     if (is_open) {                              /* Starts with OPEN. */
9128         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9129     }
9130     else if (paren != '?')              /* Not Conditional */
9131         ret = br;
9132     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9133     lastbr = br;
9134     while (*RExC_parse == '|') {
9135         if (!SIZE_ONLY && RExC_extralen) {
9136             ender = reganode(pRExC_state, LONGJMP,0);
9137             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9138         }
9139         if (SIZE_ONLY)
9140             RExC_extralen += 2;         /* Account for LONGJMP. */
9141         nextchar(pRExC_state);
9142         if (freeze_paren) {
9143             if (RExC_npar > after_freeze)
9144                 after_freeze = RExC_npar;
9145             RExC_npar = freeze_paren;
9146         }
9147         br = regbranch(pRExC_state, &flags, 0, depth+1);
9148
9149         if (br == NULL)
9150             return(NULL);
9151         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9152         lastbr = br;
9153         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9154     }
9155
9156     if (have_branch || paren != ':') {
9157         /* Make a closing node, and hook it on the end. */
9158         switch (paren) {
9159         case ':':
9160             ender = reg_node(pRExC_state, TAIL);
9161             break;
9162         case 1:
9163             ender = reganode(pRExC_state, CLOSE, parno);
9164             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9165                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9166                         "Setting close paren #%"IVdf" to %d\n",
9167                         (IV)parno, REG_NODE_NUM(ender)));
9168                 RExC_close_parens[parno-1]= ender;
9169                 if (RExC_nestroot == parno)
9170                     RExC_nestroot = 0;
9171             }
9172             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9173             Set_Node_Length(ender,1); /* MJD */
9174             break;
9175         case '<':
9176         case ',':
9177         case '=':
9178         case '!':
9179             *flagp &= ~HASWIDTH;
9180             /* FALL THROUGH */
9181         case '>':
9182             ender = reg_node(pRExC_state, SUCCEED);
9183             break;
9184         case 0:
9185             ender = reg_node(pRExC_state, END);
9186             if (!SIZE_ONLY) {
9187                 assert(!RExC_opend); /* there can only be one! */
9188                 RExC_opend = ender;
9189             }
9190             break;
9191         }
9192         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9193             SV * const mysv_val1=sv_newmortal();
9194             SV * const mysv_val2=sv_newmortal();
9195             DEBUG_PARSE_MSG("lsbr");
9196             regprop(RExC_rx, mysv_val1, lastbr);
9197             regprop(RExC_rx, mysv_val2, ender);
9198             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9199                           SvPV_nolen_const(mysv_val1),
9200                           (IV)REG_NODE_NUM(lastbr),
9201                           SvPV_nolen_const(mysv_val2),
9202                           (IV)REG_NODE_NUM(ender),
9203                           (IV)(ender - lastbr)
9204             );
9205         });
9206         REGTAIL(pRExC_state, lastbr, ender);
9207
9208         if (have_branch && !SIZE_ONLY) {
9209             char is_nothing= 1;
9210             if (depth==1)
9211                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9212
9213             /* Hook the tails of the branches to the closing node. */
9214             for (br = ret; br; br = regnext(br)) {
9215                 const U8 op = PL_regkind[OP(br)];
9216                 if (op == BRANCH) {
9217                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9218                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9219                         is_nothing= 0;
9220                 }
9221                 else if (op == BRANCHJ) {
9222                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9223                     /* for now we always disable this optimisation * /
9224                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9225                     */
9226                         is_nothing= 0;
9227                 }
9228             }
9229             if (is_nothing) {
9230                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9231                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9232                     SV * const mysv_val1=sv_newmortal();
9233                     SV * const mysv_val2=sv_newmortal();
9234                     DEBUG_PARSE_MSG("NADA");
9235                     regprop(RExC_rx, mysv_val1, ret);
9236                     regprop(RExC_rx, mysv_val2, ender);
9237                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9238                                   SvPV_nolen_const(mysv_val1),
9239                                   (IV)REG_NODE_NUM(ret),
9240                                   SvPV_nolen_const(mysv_val2),
9241                                   (IV)REG_NODE_NUM(ender),
9242                                   (IV)(ender - ret)
9243                     );
9244                 });
9245                 OP(br)= NOTHING;
9246                 if (OP(ender) == TAIL) {
9247                     NEXT_OFF(br)= 0;
9248                     RExC_emit= br + 1;
9249                 } else {
9250                     regnode *opt;
9251                     for ( opt= br + 1; opt < ender ; opt++ )
9252                         OP(opt)= OPTIMIZED;
9253                     NEXT_OFF(br)= ender - br;
9254                 }
9255             }
9256         }
9257     }
9258
9259     {
9260         const char *p;
9261         static const char parens[] = "=!<,>";
9262
9263         if (paren && (p = strchr(parens, paren))) {
9264             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9265             int flag = (p - parens) > 1;
9266
9267             if (paren == '>')
9268                 node = SUSPEND, flag = 0;
9269             reginsert(pRExC_state, node,ret, depth+1);
9270             Set_Node_Cur_Length(ret);
9271             Set_Node_Offset(ret, parse_start + 1);
9272             ret->flags = flag;
9273             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9274         }
9275     }
9276
9277     /* Check for proper termination. */
9278     if (paren) {
9279         RExC_flags = oregflags;
9280         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9281             RExC_parse = oregcomp_parse;
9282             vFAIL("Unmatched (");
9283         }
9284     }
9285     else if (!paren && RExC_parse < RExC_end) {
9286         if (*RExC_parse == ')') {
9287             RExC_parse++;
9288             vFAIL("Unmatched )");
9289         }
9290         else
9291             FAIL("Junk on end of regexp");      /* "Can't happen". */
9292         assert(0); /* NOTREACHED */
9293     }
9294
9295     if (RExC_in_lookbehind) {
9296         RExC_in_lookbehind--;
9297     }
9298     if (after_freeze > RExC_npar)
9299         RExC_npar = after_freeze;
9300     return(ret);
9301 }
9302
9303 /*
9304  - regbranch - one alternative of an | operator
9305  *
9306  * Implements the concatenation operator.
9307  */
9308 STATIC regnode *
9309 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9310 {
9311     dVAR;
9312     regnode *ret;
9313     regnode *chain = NULL;
9314     regnode *latest;
9315     I32 flags = 0, c = 0;
9316     GET_RE_DEBUG_FLAGS_DECL;
9317
9318     PERL_ARGS_ASSERT_REGBRANCH;
9319
9320     DEBUG_PARSE("brnc");
9321
9322     if (first)
9323         ret = NULL;
9324     else {
9325         if (!SIZE_ONLY && RExC_extralen)
9326             ret = reganode(pRExC_state, BRANCHJ,0);
9327         else {
9328             ret = reg_node(pRExC_state, BRANCH);
9329             Set_Node_Length(ret, 1);
9330         }
9331     }
9332
9333     if (!first && SIZE_ONLY)
9334         RExC_extralen += 1;                     /* BRANCHJ */
9335
9336     *flagp = WORST;                     /* Tentatively. */
9337
9338     RExC_parse--;
9339     nextchar(pRExC_state);
9340     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9341         flags &= ~TRYAGAIN;
9342         latest = regpiece(pRExC_state, &flags,depth+1);
9343         if (latest == NULL) {
9344             if (flags & TRYAGAIN)
9345                 continue;
9346             return(NULL);
9347         }
9348         else if (ret == NULL)
9349             ret = latest;
9350         *flagp |= flags&(HASWIDTH|POSTPONED);
9351         if (chain == NULL)      /* First piece. */
9352             *flagp |= flags&SPSTART;
9353         else {
9354             RExC_naughty++;
9355             REGTAIL(pRExC_state, chain, latest);
9356         }
9357         chain = latest;
9358         c++;
9359     }
9360     if (chain == NULL) {        /* Loop ran zero times. */
9361         chain = reg_node(pRExC_state, NOTHING);
9362         if (ret == NULL)
9363             ret = chain;
9364     }
9365     if (c == 1) {
9366         *flagp |= flags&SIMPLE;
9367     }
9368
9369     return ret;
9370 }
9371
9372 /*
9373  - regpiece - something followed by possible [*+?]
9374  *
9375  * Note that the branching code sequences used for ? and the general cases
9376  * of * and + are somewhat optimized:  they use the same NOTHING node as
9377  * both the endmarker for their branch list and the body of the last branch.
9378  * It might seem that this node could be dispensed with entirely, but the
9379  * endmarker role is not redundant.
9380  */
9381 STATIC regnode *
9382 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9383 {
9384     dVAR;
9385     regnode *ret;
9386     char op;
9387     char *next;
9388     I32 flags;
9389     const char * const origparse = RExC_parse;
9390     I32 min;
9391     I32 max = REG_INFTY;
9392 #ifdef RE_TRACK_PATTERN_OFFSETS
9393     char *parse_start;
9394 #endif
9395     const char *maxpos = NULL;
9396     GET_RE_DEBUG_FLAGS_DECL;
9397
9398     PERL_ARGS_ASSERT_REGPIECE;
9399
9400     DEBUG_PARSE("piec");
9401
9402     ret = regatom(pRExC_state, &flags,depth+1);
9403     if (ret == NULL) {
9404         if (flags & TRYAGAIN)
9405             *flagp |= TRYAGAIN;
9406         return(NULL);
9407     }
9408
9409     op = *RExC_parse;
9410
9411     if (op == '{' && regcurly(RExC_parse)) {
9412         maxpos = NULL;
9413 #ifdef RE_TRACK_PATTERN_OFFSETS
9414         parse_start = RExC_parse; /* MJD */
9415 #endif
9416         next = RExC_parse + 1;
9417         while (isDIGIT(*next) || *next == ',') {
9418             if (*next == ',') {
9419                 if (maxpos)
9420                     break;
9421                 else
9422                     maxpos = next;
9423             }
9424             next++;
9425         }
9426         if (*next == '}') {             /* got one */
9427             if (!maxpos)
9428                 maxpos = next;
9429             RExC_parse++;
9430             min = atoi(RExC_parse);
9431             if (*maxpos == ',')
9432                 maxpos++;
9433             else
9434                 maxpos = RExC_parse;
9435             max = atoi(maxpos);
9436             if (!max && *maxpos != '0')
9437                 max = REG_INFTY;                /* meaning "infinity" */
9438             else if (max >= REG_INFTY)
9439                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9440             RExC_parse = next;
9441             nextchar(pRExC_state);
9442
9443         do_curly:
9444             if ((flags&SIMPLE)) {
9445                 RExC_naughty += 2 + RExC_naughty / 2;
9446                 reginsert(pRExC_state, CURLY, ret, depth+1);
9447                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9448                 Set_Node_Cur_Length(ret);
9449             }
9450             else {
9451                 regnode * const w = reg_node(pRExC_state, WHILEM);
9452
9453                 w->flags = 0;
9454                 REGTAIL(pRExC_state, ret, w);
9455                 if (!SIZE_ONLY && RExC_extralen) {
9456                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9457                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9458                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9459                 }
9460                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9461                                 /* MJD hk */
9462                 Set_Node_Offset(ret, parse_start+1);
9463                 Set_Node_Length(ret,
9464                                 op == '{' ? (RExC_parse - parse_start) : 1);
9465
9466                 if (!SIZE_ONLY && RExC_extralen)
9467                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9468                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9469                 if (SIZE_ONLY)
9470                     RExC_whilem_seen++, RExC_extralen += 3;
9471                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9472             }
9473             ret->flags = 0;
9474
9475             if (min > 0)
9476                 *flagp = WORST;
9477             if (max > 0)
9478                 *flagp |= HASWIDTH;
9479             if (max < min)
9480                 vFAIL("Can't do {n,m} with n > m");
9481             if (!SIZE_ONLY) {
9482                 ARG1_SET(ret, (U16)min);
9483                 ARG2_SET(ret, (U16)max);
9484             }
9485
9486             goto nest_check;
9487         }
9488     }
9489
9490     if (!ISMULT1(op)) {
9491         *flagp = flags;
9492         return(ret);
9493     }
9494
9495 #if 0                           /* Now runtime fix should be reliable. */
9496
9497     /* if this is reinstated, don't forget to put this back into perldiag:
9498
9499             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9500
9501            (F) The part of the regexp subject to either the * or + quantifier
9502            could match an empty string. The {#} shows in the regular
9503            expression about where the problem was discovered.
9504
9505     */
9506
9507     if (!(flags&HASWIDTH) && op != '?')
9508       vFAIL("Regexp *+ operand could be empty");
9509 #endif
9510
9511 #ifdef RE_TRACK_PATTERN_OFFSETS
9512     parse_start = RExC_parse;
9513 #endif
9514     nextchar(pRExC_state);
9515
9516     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9517
9518     if (op == '*' && (flags&SIMPLE)) {
9519         reginsert(pRExC_state, STAR, ret, depth+1);
9520         ret->flags = 0;
9521         RExC_naughty += 4;
9522     }
9523     else if (op == '*') {
9524         min = 0;
9525         goto do_curly;
9526     }
9527     else if (op == '+' && (flags&SIMPLE)) {
9528         reginsert(pRExC_state, PLUS, ret, depth+1);
9529         ret->flags = 0;
9530         RExC_naughty += 3;
9531     }
9532     else if (op == '+') {
9533         min = 1;
9534         goto do_curly;
9535     }
9536     else if (op == '?') {
9537         min = 0; max = 1;
9538         goto do_curly;
9539     }
9540   nest_check:
9541     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9542         ckWARN3reg(RExC_parse,
9543                    "%.*s matches null string many times",
9544                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9545                    origparse);
9546     }
9547
9548     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9549         nextchar(pRExC_state);
9550         reginsert(pRExC_state, MINMOD, ret, depth+1);
9551         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9552     }
9553 #ifndef REG_ALLOW_MINMOD_SUSPEND
9554     else
9555 #endif
9556     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9557         regnode *ender;
9558         nextchar(pRExC_state);
9559         ender = reg_node(pRExC_state, SUCCEED);
9560         REGTAIL(pRExC_state, ret, ender);
9561         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9562         ret->flags = 0;
9563         ender = reg_node(pRExC_state, TAIL);
9564         REGTAIL(pRExC_state, ret, ender);
9565         /*ret= ender;*/
9566     }
9567
9568     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9569         RExC_parse++;
9570         vFAIL("Nested quantifiers");
9571     }
9572
9573     return(ret);
9574 }
9575
9576 STATIC bool
9577 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9578 {
9579
9580  /* This is expected to be called by a parser routine that has recognized '\N'
9581    and needs to handle the rest. RExC_parse is expected to point at the first
9582    char following the N at the time of the call.  On successful return,
9583    RExC_parse has been updated to point to just after the sequence identified
9584    by this routine, and <*flagp> has been updated.
9585
9586    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9587    character class.
9588
9589    \N may begin either a named sequence, or if outside a character class, mean
9590    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9591    attempted to decide which, and in the case of a named sequence, converted it
9592    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9593    where c1... are the characters in the sequence.  For single-quoted regexes,
9594    the tokenizer passes the \N sequence through unchanged; this code will not
9595    attempt to determine this nor expand those, instead raising a syntax error.
9596    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9597    or there is no '}', it signals that this \N occurrence means to match a
9598    non-newline.
9599
9600    Only the \N{U+...} form should occur in a character class, for the same
9601    reason that '.' inside a character class means to just match a period: it
9602    just doesn't make sense.
9603
9604    The function raises an error (via vFAIL), and doesn't return for various
9605    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9606    success; it returns FALSE otherwise.
9607
9608    If <valuep> is non-null, it means the caller can accept an input sequence
9609    consisting of a just a single code point; <*valuep> is set to that value
9610    if the input is such.
9611
9612    If <node_p> is non-null it signifies that the caller can accept any other
9613    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9614    is set as follows:
9615     1) \N means not-a-NL: points to a newly created REG_ANY node;
9616     2) \N{}:              points to a new NOTHING node;
9617     3) otherwise:         points to a new EXACT node containing the resolved
9618                           string.
9619    Note that FALSE is returned for single code point sequences if <valuep> is
9620    null.
9621  */
9622
9623     char * endbrace;    /* '}' following the name */
9624     char* p;
9625     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9626                            stream */
9627     bool has_multiple_chars; /* true if the input stream contains a sequence of
9628                                 more than one character */
9629
9630     GET_RE_DEBUG_FLAGS_DECL;
9631
9632     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9633
9634     GET_RE_DEBUG_FLAGS;
9635
9636     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9637
9638     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9639      * modifier.  The other meaning does not */
9640     p = (RExC_flags & RXf_PMf_EXTENDED)
9641         ? regwhite( pRExC_state, RExC_parse )
9642         : RExC_parse;
9643
9644     /* Disambiguate between \N meaning a named character versus \N meaning
9645      * [^\n].  The former is assumed when it can't be the latter. */
9646     if (*p != '{' || regcurly(p)) {
9647         RExC_parse = p;
9648         if (! node_p) {
9649             /* no bare \N in a charclass */
9650             if (in_char_class) {
9651                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9652             }
9653             return FALSE;
9654         }
9655         nextchar(pRExC_state);
9656         *node_p = reg_node(pRExC_state, REG_ANY);
9657         *flagp |= HASWIDTH|SIMPLE;
9658         RExC_naughty++;
9659         RExC_parse--;
9660         Set_Node_Length(*node_p, 1); /* MJD */
9661         return TRUE;
9662     }
9663
9664     /* Here, we have decided it should be a named character or sequence */
9665
9666     /* The test above made sure that the next real character is a '{', but
9667      * under the /x modifier, it could be separated by space (or a comment and
9668      * \n) and this is not allowed (for consistency with \x{...} and the
9669      * tokenizer handling of \N{NAME}). */
9670     if (*RExC_parse != '{') {
9671         vFAIL("Missing braces on \\N{}");
9672     }
9673
9674     RExC_parse++;       /* Skip past the '{' */
9675
9676     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9677         || ! (endbrace == RExC_parse            /* nothing between the {} */
9678               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9679                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9680     {
9681         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9682         vFAIL("\\N{NAME} must be resolved by the lexer");
9683     }
9684
9685     if (endbrace == RExC_parse) {   /* empty: \N{} */
9686         bool ret = TRUE;
9687         if (node_p) {
9688             *node_p = reg_node(pRExC_state,NOTHING);
9689         }
9690         else if (in_char_class) {
9691             if (SIZE_ONLY && in_char_class) {
9692                 ckWARNreg(RExC_parse,
9693                         "Ignoring zero length \\N{} in character class"
9694                 );
9695             }
9696             ret = FALSE;
9697         }
9698         else {
9699             return FALSE;
9700         }
9701         nextchar(pRExC_state);
9702         return ret;
9703     }
9704
9705     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9706     RExC_parse += 2;    /* Skip past the 'U+' */
9707
9708     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9709
9710     /* Code points are separated by dots.  If none, there is only one code
9711      * point, and is terminated by the brace */
9712     has_multiple_chars = (endchar < endbrace);
9713
9714     if (valuep && (! has_multiple_chars || in_char_class)) {
9715         /* We only pay attention to the first char of
9716         multichar strings being returned in char classes. I kinda wonder
9717         if this makes sense as it does change the behaviour
9718         from earlier versions, OTOH that behaviour was broken
9719         as well. XXX Solution is to recharacterize as
9720         [rest-of-class]|multi1|multi2... */
9721
9722         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9723         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9724             | PERL_SCAN_DISALLOW_PREFIX
9725             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9726
9727         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9728
9729         /* The tokenizer should have guaranteed validity, but it's possible to
9730          * bypass it by using single quoting, so check */
9731         if (length_of_hex == 0
9732             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9733         {
9734             RExC_parse += length_of_hex;        /* Includes all the valid */
9735             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9736                             ? UTF8SKIP(RExC_parse)
9737                             : 1;
9738             /* Guard against malformed utf8 */
9739             if (RExC_parse >= endchar) {
9740                 RExC_parse = endchar;
9741             }
9742             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9743         }
9744
9745         if (in_char_class && has_multiple_chars) {
9746             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9747         }
9748         RExC_parse = endbrace + 1;
9749     }
9750     else if (! node_p || ! has_multiple_chars) {
9751
9752         /* Here, the input is legal, but not according to the caller's
9753          * options.  We fail without advancing the parse, so that the
9754          * caller can try again */
9755         RExC_parse = p;
9756         return FALSE;
9757     }
9758     else {
9759
9760         /* What is done here is to convert this to a sub-pattern of the form
9761          * (?:\x{char1}\x{char2}...)
9762          * and then call reg recursively.  That way, it retains its atomicness,
9763          * while not having to worry about special handling that some code
9764          * points may have.  toke.c has converted the original Unicode values
9765          * to native, so that we can just pass on the hex values unchanged.  We
9766          * do have to set a flag to keep recoding from happening in the
9767          * recursion */
9768
9769         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9770         STRLEN len;
9771         char *orig_end = RExC_end;
9772         I32 flags;
9773
9774         while (RExC_parse < endbrace) {
9775
9776             /* Convert to notation the rest of the code understands */
9777             sv_catpv(substitute_parse, "\\x{");
9778             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9779             sv_catpv(substitute_parse, "}");
9780
9781             /* Point to the beginning of the next character in the sequence. */
9782             RExC_parse = endchar + 1;
9783             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9784         }
9785         sv_catpv(substitute_parse, ")");
9786
9787         RExC_parse = SvPV(substitute_parse, len);
9788
9789         /* Don't allow empty number */
9790         if (len < 8) {
9791             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9792         }
9793         RExC_end = RExC_parse + len;
9794
9795         /* The values are Unicode, and therefore not subject to recoding */
9796         RExC_override_recoding = 1;
9797
9798         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9799         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9800
9801         RExC_parse = endbrace;
9802         RExC_end = orig_end;
9803         RExC_override_recoding = 0;
9804
9805         nextchar(pRExC_state);
9806     }
9807
9808     return TRUE;
9809 }
9810
9811
9812 /*
9813  * reg_recode
9814  *
9815  * It returns the code point in utf8 for the value in *encp.
9816  *    value: a code value in the source encoding
9817  *    encp:  a pointer to an Encode object
9818  *
9819  * If the result from Encode is not a single character,
9820  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9821  */
9822 STATIC UV
9823 S_reg_recode(pTHX_ const char value, SV **encp)
9824 {
9825     STRLEN numlen = 1;
9826     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9827     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9828     const STRLEN newlen = SvCUR(sv);
9829     UV uv = UNICODE_REPLACEMENT;
9830
9831     PERL_ARGS_ASSERT_REG_RECODE;
9832
9833     if (newlen)
9834         uv = SvUTF8(sv)
9835              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9836              : *(U8*)s;
9837
9838     if (!newlen || numlen != newlen) {
9839         uv = UNICODE_REPLACEMENT;
9840         *encp = NULL;
9841     }
9842     return uv;
9843 }
9844
9845 PERL_STATIC_INLINE U8
9846 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9847 {
9848     U8 op;
9849
9850     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9851
9852     if (! FOLD) {
9853         return EXACT;
9854     }
9855
9856     op = get_regex_charset(RExC_flags);
9857     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9858         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9859                  been, so there is no hole */
9860     }
9861
9862     return op + EXACTF;
9863 }
9864
9865 PERL_STATIC_INLINE void
9866 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9867 {
9868     /* This knows the details about sizing an EXACTish node, setting flags for
9869      * it (by setting <*flagp>, and potentially populating it with a single
9870      * character.
9871      *
9872      * If <len> is non-zero, this function assumes that the node has already
9873      * been populated, and just does the sizing.  In this case <code_point>
9874      * should be the final code point that has already been placed into the
9875      * node.  This value will be ignored except that under some circumstances
9876      * <*flagp> is set based on it.
9877      *
9878      * If <len is zero, the function assumes that the node is to contain only
9879      * the single character given by <code_point> and calculates what <len>
9880      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9881      * additionally will populate the node's STRING with <code_point>, if <len>
9882      * is 0.  In both cases <*flagp> is appropriately set
9883      *
9884      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9885      * folded (the latter only when the rules indicate it can match 'ss') */
9886
9887     bool len_passed_in = cBOOL(len != 0);
9888     U8 character[UTF8_MAXBYTES_CASE+1];
9889
9890     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9891
9892     if (! len_passed_in) {
9893         if (UTF) {
9894             if (FOLD) {
9895                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9896             }
9897             else {
9898                 uvchr_to_utf8( character, code_point);
9899                 len = UTF8SKIP(character);
9900             }
9901         }
9902         else if (! FOLD
9903                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9904                  || ASCII_FOLD_RESTRICTED
9905                  || ! AT_LEAST_UNI_SEMANTICS)
9906         {
9907             *character = (U8) code_point;
9908             len = 1;
9909         }
9910         else {
9911             *character = 's';
9912             *(character + 1) = 's';
9913             len = 2;
9914         }
9915     }
9916
9917     if (SIZE_ONLY) {
9918         RExC_size += STR_SZ(len);
9919     }
9920     else {
9921         RExC_emit += STR_SZ(len);
9922         STR_LEN(node) = len;
9923         if (! len_passed_in) {
9924             Copy((char *) character, STRING(node), len, char);
9925         }
9926     }
9927
9928     *flagp |= HASWIDTH;
9929     if (len == 1 && UNI_IS_INVARIANT(code_point))
9930         *flagp |= SIMPLE;
9931 }
9932
9933 /*
9934  - regatom - the lowest level
9935
9936    Try to identify anything special at the start of the pattern. If there
9937    is, then handle it as required. This may involve generating a single regop,
9938    such as for an assertion; or it may involve recursing, such as to
9939    handle a () structure.
9940
9941    If the string doesn't start with something special then we gobble up
9942    as much literal text as we can.
9943
9944    Once we have been able to handle whatever type of thing started the
9945    sequence, we return.
9946
9947    Note: we have to be careful with escapes, as they can be both literal
9948    and special, and in the case of \10 and friends, context determines which.
9949
9950    A summary of the code structure is:
9951
9952    switch (first_byte) {
9953         cases for each special:
9954             handle this special;
9955             break;
9956         case '\\':
9957             switch (2nd byte) {
9958                 cases for each unambiguous special:
9959                     handle this special;
9960                     break;
9961                 cases for each ambigous special/literal:
9962                     disambiguate;
9963                     if (special)  handle here
9964                     else goto defchar;
9965                 default: // unambiguously literal:
9966                     goto defchar;
9967             }
9968         default:  // is a literal char
9969             // FALL THROUGH
9970         defchar:
9971             create EXACTish node for literal;
9972             while (more input and node isn't full) {
9973                 switch (input_byte) {
9974                    cases for each special;
9975                        make sure parse pointer is set so that the next call to
9976                            regatom will see this special first
9977                        goto loopdone; // EXACTish node terminated by prev. char
9978                    default:
9979                        append char to EXACTISH node;
9980                 }
9981                 get next input byte;
9982             }
9983         loopdone:
9984    }
9985    return the generated node;
9986
9987    Specifically there are two separate switches for handling
9988    escape sequences, with the one for handling literal escapes requiring
9989    a dummy entry for all of the special escapes that are actually handled
9990    by the other.
9991 */
9992
9993 STATIC regnode *
9994 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9995 {
9996     dVAR;
9997     regnode *ret = NULL;
9998     I32 flags;
9999     char *parse_start = RExC_parse;
10000     U8 op;
10001     GET_RE_DEBUG_FLAGS_DECL;
10002     DEBUG_PARSE("atom");
10003     *flagp = WORST;             /* Tentatively. */
10004
10005     PERL_ARGS_ASSERT_REGATOM;
10006
10007 tryagain:
10008     switch ((U8)*RExC_parse) {
10009     case '^':
10010         RExC_seen_zerolen++;
10011         nextchar(pRExC_state);
10012         if (RExC_flags & RXf_PMf_MULTILINE)
10013             ret = reg_node(pRExC_state, MBOL);
10014         else if (RExC_flags & RXf_PMf_SINGLELINE)
10015             ret = reg_node(pRExC_state, SBOL);
10016         else
10017             ret = reg_node(pRExC_state, BOL);
10018         Set_Node_Length(ret, 1); /* MJD */
10019         break;
10020     case '$':
10021         nextchar(pRExC_state);
10022         if (*RExC_parse)
10023             RExC_seen_zerolen++;
10024         if (RExC_flags & RXf_PMf_MULTILINE)
10025             ret = reg_node(pRExC_state, MEOL);
10026         else if (RExC_flags & RXf_PMf_SINGLELINE)
10027             ret = reg_node(pRExC_state, SEOL);
10028         else
10029             ret = reg_node(pRExC_state, EOL);
10030         Set_Node_Length(ret, 1); /* MJD */
10031         break;
10032     case '.':
10033         nextchar(pRExC_state);
10034         if (RExC_flags & RXf_PMf_SINGLELINE)
10035             ret = reg_node(pRExC_state, SANY);
10036         else
10037             ret = reg_node(pRExC_state, REG_ANY);
10038         *flagp |= HASWIDTH|SIMPLE;
10039         RExC_naughty++;
10040         Set_Node_Length(ret, 1); /* MJD */
10041         break;
10042     case '[':
10043     {
10044         char * const oregcomp_parse = ++RExC_parse;
10045         ret = regclass(pRExC_state, flagp,depth+1);
10046         if (*RExC_parse != ']') {
10047             RExC_parse = oregcomp_parse;
10048             vFAIL("Unmatched [");
10049         }
10050         nextchar(pRExC_state);
10051         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10052         break;
10053     }
10054     case '(':
10055         nextchar(pRExC_state);
10056         ret = reg(pRExC_state, 1, &flags,depth+1);
10057         if (ret == NULL) {
10058                 if (flags & TRYAGAIN) {
10059                     if (RExC_parse == RExC_end) {
10060                          /* Make parent create an empty node if needed. */
10061                         *flagp |= TRYAGAIN;
10062                         return(NULL);
10063                     }
10064                     goto tryagain;
10065                 }
10066                 return(NULL);
10067         }
10068         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10069         break;
10070     case '|':
10071     case ')':
10072         if (flags & TRYAGAIN) {
10073             *flagp |= TRYAGAIN;
10074             return NULL;
10075         }
10076         vFAIL("Internal urp");
10077                                 /* Supposed to be caught earlier. */
10078         break;
10079     case '?':
10080     case '+':
10081     case '*':
10082         RExC_parse++;
10083         vFAIL("Quantifier follows nothing");
10084         break;
10085     case '\\':
10086         /* Special Escapes
10087
10088            This switch handles escape sequences that resolve to some kind
10089            of special regop and not to literal text. Escape sequnces that
10090            resolve to literal text are handled below in the switch marked
10091            "Literal Escapes".
10092
10093            Every entry in this switch *must* have a corresponding entry
10094            in the literal escape switch. However, the opposite is not
10095            required, as the default for this switch is to jump to the
10096            literal text handling code.
10097         */
10098         switch ((U8)*++RExC_parse) {
10099         /* Special Escapes */
10100         case 'A':
10101             RExC_seen_zerolen++;
10102             ret = reg_node(pRExC_state, SBOL);
10103             *flagp |= SIMPLE;
10104             goto finish_meta_pat;
10105         case 'G':
10106             ret = reg_node(pRExC_state, GPOS);
10107             RExC_seen |= REG_SEEN_GPOS;
10108             *flagp |= SIMPLE;
10109             goto finish_meta_pat;
10110         case 'K':
10111             RExC_seen_zerolen++;
10112             ret = reg_node(pRExC_state, KEEPS);
10113             *flagp |= SIMPLE;
10114             /* XXX:dmq : disabling in-place substitution seems to
10115              * be necessary here to avoid cases of memory corruption, as
10116              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10117              */
10118             RExC_seen |= REG_SEEN_LOOKBEHIND;
10119             goto finish_meta_pat;
10120         case 'Z':
10121             ret = reg_node(pRExC_state, SEOL);
10122             *flagp |= SIMPLE;
10123             RExC_seen_zerolen++;                /* Do not optimize RE away */
10124             goto finish_meta_pat;
10125         case 'z':
10126             ret = reg_node(pRExC_state, EOS);
10127             *flagp |= SIMPLE;
10128             RExC_seen_zerolen++;                /* Do not optimize RE away */
10129             goto finish_meta_pat;
10130         case 'C':
10131             ret = reg_node(pRExC_state, CANY);
10132             RExC_seen |= REG_SEEN_CANY;
10133             *flagp |= HASWIDTH|SIMPLE;
10134             goto finish_meta_pat;
10135         case 'X':
10136             ret = reg_node(pRExC_state, CLUMP);
10137             *flagp |= HASWIDTH;
10138             goto finish_meta_pat;
10139         case 'w':
10140             op = ALNUM + get_regex_charset(RExC_flags);
10141             if (op > ALNUMA) {  /* /aa is same as /a */
10142                 op = ALNUMA;
10143             }
10144             ret = reg_node(pRExC_state, op);
10145             *flagp |= HASWIDTH|SIMPLE;
10146             goto finish_meta_pat;
10147         case 'W':
10148             op = NALNUM + get_regex_charset(RExC_flags);
10149             if (op > NALNUMA) { /* /aa is same as /a */
10150                 op = NALNUMA;
10151             }
10152             ret = reg_node(pRExC_state, op);
10153             *flagp |= HASWIDTH|SIMPLE;
10154             goto finish_meta_pat;
10155         case 'b':
10156             RExC_seen_zerolen++;
10157             RExC_seen |= REG_SEEN_LOOKBEHIND;
10158             op = BOUND + get_regex_charset(RExC_flags);
10159             if (op > BOUNDA) {  /* /aa is same as /a */
10160                 op = BOUNDA;
10161             }
10162             ret = reg_node(pRExC_state, op);
10163             FLAGS(ret) = get_regex_charset(RExC_flags);
10164             *flagp |= SIMPLE;
10165             goto finish_meta_pat;
10166         case 'B':
10167             RExC_seen_zerolen++;
10168             RExC_seen |= REG_SEEN_LOOKBEHIND;
10169             op = NBOUND + get_regex_charset(RExC_flags);
10170             if (op > NBOUNDA) { /* /aa is same as /a */
10171                 op = NBOUNDA;
10172             }
10173             ret = reg_node(pRExC_state, op);
10174             FLAGS(ret) = get_regex_charset(RExC_flags);
10175             *flagp |= SIMPLE;
10176             goto finish_meta_pat;
10177         case 's':
10178             op = SPACE + get_regex_charset(RExC_flags);
10179             if (op > SPACEA) {  /* /aa is same as /a */
10180                 op = SPACEA;
10181             }
10182             ret = reg_node(pRExC_state, op);
10183             *flagp |= HASWIDTH|SIMPLE;
10184             goto finish_meta_pat;
10185         case 'S':
10186             op = NSPACE + get_regex_charset(RExC_flags);
10187             if (op > NSPACEA) { /* /aa is same as /a */
10188                 op = NSPACEA;
10189             }
10190             ret = reg_node(pRExC_state, op);
10191             *flagp |= HASWIDTH|SIMPLE;
10192             goto finish_meta_pat;
10193         case 'D':
10194             op = NDIGIT;
10195             goto join_D_and_d;
10196         case 'd':
10197             op = DIGIT;
10198         join_D_and_d:
10199             {
10200                 U8 offset = get_regex_charset(RExC_flags);
10201                 if (offset == REGEX_UNICODE_CHARSET) {
10202                     offset = REGEX_DEPENDS_CHARSET;
10203                 }
10204                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10205                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10206                 }
10207                 op += offset;
10208             }
10209             ret = reg_node(pRExC_state, op);
10210             *flagp |= HASWIDTH|SIMPLE;
10211             goto finish_meta_pat;
10212         case 'R':
10213             ret = reg_node(pRExC_state, LNBREAK);
10214             *flagp |= HASWIDTH|SIMPLE;
10215             goto finish_meta_pat;
10216         case 'h':
10217             ret = reg_node(pRExC_state, HORIZWS);
10218             *flagp |= HASWIDTH|SIMPLE;
10219             goto finish_meta_pat;
10220         case 'H':
10221             ret = reg_node(pRExC_state, NHORIZWS);
10222             *flagp |= HASWIDTH|SIMPLE;
10223             goto finish_meta_pat;
10224         case 'v':
10225             ret = reg_node(pRExC_state, VERTWS);
10226             *flagp |= HASWIDTH|SIMPLE;
10227             goto finish_meta_pat;
10228         case 'V':
10229             ret = reg_node(pRExC_state, NVERTWS);
10230             *flagp |= HASWIDTH|SIMPLE;
10231          finish_meta_pat:
10232             nextchar(pRExC_state);
10233             Set_Node_Length(ret, 2); /* MJD */
10234             break;
10235         case 'p':
10236         case 'P':
10237             {
10238                 char* const oldregxend = RExC_end;
10239 #ifdef DEBUGGING
10240                 char* parse_start = RExC_parse - 2;
10241 #endif
10242
10243                 if (RExC_parse[1] == '{') {
10244                   /* a lovely hack--pretend we saw [\pX] instead */
10245                     RExC_end = strchr(RExC_parse, '}');
10246                     if (!RExC_end) {
10247                         const U8 c = (U8)*RExC_parse;
10248                         RExC_parse += 2;
10249                         RExC_end = oldregxend;
10250                         vFAIL2("Missing right brace on \\%c{}", c);
10251                     }
10252                     RExC_end++;
10253                 }
10254                 else {
10255                     RExC_end = RExC_parse + 2;
10256                     if (RExC_end > oldregxend)
10257                         RExC_end = oldregxend;
10258                 }
10259                 RExC_parse--;
10260
10261                 ret = regclass(pRExC_state, flagp,depth+1);
10262
10263                 RExC_end = oldregxend;
10264                 RExC_parse--;
10265
10266                 Set_Node_Offset(ret, parse_start + 2);
10267                 Set_Node_Cur_Length(ret);
10268                 nextchar(pRExC_state);
10269             }
10270             break;
10271         case 'N':
10272             /* Handle \N and \N{NAME} with multiple code points here and not
10273              * below because it can be multicharacter. join_exact() will join
10274              * them up later on.  Also this makes sure that things like
10275              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10276              * The options to the grok function call causes it to fail if the
10277              * sequence is just a single code point.  We then go treat it as
10278              * just another character in the current EXACT node, and hence it
10279              * gets uniform treatment with all the other characters.  The
10280              * special treatment for quantifiers is not needed for such single
10281              * character sequences */
10282             ++RExC_parse;
10283             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10284                 RExC_parse--;
10285                 goto defchar;
10286             }
10287             break;
10288         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10289         parse_named_seq:
10290         {
10291             char ch= RExC_parse[1];
10292             if (ch != '<' && ch != '\'' && ch != '{') {
10293                 RExC_parse++;
10294                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10295             } else {
10296                 /* this pretty much dupes the code for (?P=...) in reg(), if
10297                    you change this make sure you change that */
10298                 char* name_start = (RExC_parse += 2);
10299                 U32 num = 0;
10300                 SV *sv_dat = reg_scan_name(pRExC_state,
10301                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10302                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10303                 if (RExC_parse == name_start || *RExC_parse != ch)
10304                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10305
10306                 if (!SIZE_ONLY) {
10307                     num = add_data( pRExC_state, 1, "S" );
10308                     RExC_rxi->data->data[num]=(void*)sv_dat;
10309                     SvREFCNT_inc_simple_void(sv_dat);
10310                 }
10311
10312                 RExC_sawback = 1;
10313                 ret = reganode(pRExC_state,
10314                                ((! FOLD)
10315                                  ? NREF
10316                                  : (ASCII_FOLD_RESTRICTED)
10317                                    ? NREFFA
10318                                    : (AT_LEAST_UNI_SEMANTICS)
10319                                      ? NREFFU
10320                                      : (LOC)
10321                                        ? NREFFL
10322                                        : NREFF),
10323                                 num);
10324                 *flagp |= HASWIDTH;
10325
10326                 /* override incorrect value set in reganode MJD */
10327                 Set_Node_Offset(ret, parse_start+1);
10328                 Set_Node_Cur_Length(ret); /* MJD */
10329                 nextchar(pRExC_state);
10330
10331             }
10332             break;
10333         }
10334         case 'g':
10335         case '1': case '2': case '3': case '4':
10336         case '5': case '6': case '7': case '8': case '9':
10337             {
10338                 I32 num;
10339                 bool isg = *RExC_parse == 'g';
10340                 bool isrel = 0;
10341                 bool hasbrace = 0;
10342                 if (isg) {
10343                     RExC_parse++;
10344                     if (*RExC_parse == '{') {
10345                         RExC_parse++;
10346                         hasbrace = 1;
10347                     }
10348                     if (*RExC_parse == '-') {
10349                         RExC_parse++;
10350                         isrel = 1;
10351                     }
10352                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10353                         if (isrel) RExC_parse--;
10354                         RExC_parse -= 2;
10355                         goto parse_named_seq;
10356                 }   }
10357                 num = atoi(RExC_parse);
10358                 if (isg && num == 0)
10359                     vFAIL("Reference to invalid group 0");
10360                 if (isrel) {
10361                     num = RExC_npar - num;
10362                     if (num < 1)
10363                         vFAIL("Reference to nonexistent or unclosed group");
10364                 }
10365                 if (!isg && num > 9 && num >= RExC_npar)
10366                     /* Probably a character specified in octal, e.g. \35 */
10367                     goto defchar;
10368                 else {
10369                     char * const parse_start = RExC_parse - 1; /* MJD */
10370                     while (isDIGIT(*RExC_parse))
10371                         RExC_parse++;
10372                     if (parse_start == RExC_parse - 1)
10373                         vFAIL("Unterminated \\g... pattern");
10374                     if (hasbrace) {
10375                         if (*RExC_parse != '}')
10376                             vFAIL("Unterminated \\g{...} pattern");
10377                         RExC_parse++;
10378                     }
10379                     if (!SIZE_ONLY) {
10380                         if (num > (I32)RExC_rx->nparens)
10381                             vFAIL("Reference to nonexistent group");
10382                     }
10383                     RExC_sawback = 1;
10384                     ret = reganode(pRExC_state,
10385                                    ((! FOLD)
10386                                      ? REF
10387                                      : (ASCII_FOLD_RESTRICTED)
10388                                        ? REFFA
10389                                        : (AT_LEAST_UNI_SEMANTICS)
10390                                          ? REFFU
10391                                          : (LOC)
10392                                            ? REFFL
10393                                            : REFF),
10394                                     num);
10395                     *flagp |= HASWIDTH;
10396
10397                     /* override incorrect value set in reganode MJD */
10398                     Set_Node_Offset(ret, parse_start+1);
10399                     Set_Node_Cur_Length(ret); /* MJD */
10400                     RExC_parse--;
10401                     nextchar(pRExC_state);
10402                 }
10403             }
10404             break;
10405         case '\0':
10406             if (RExC_parse >= RExC_end)
10407                 FAIL("Trailing \\");
10408             /* FALL THROUGH */
10409         default:
10410             /* Do not generate "unrecognized" warnings here, we fall
10411                back into the quick-grab loop below */
10412             parse_start--;
10413             goto defchar;
10414         }
10415         break;
10416
10417     case '#':
10418         if (RExC_flags & RXf_PMf_EXTENDED) {
10419             if ( reg_skipcomment( pRExC_state ) )
10420                 goto tryagain;
10421         }
10422         /* FALL THROUGH */
10423
10424     default:
10425
10426             parse_start = RExC_parse - 1;
10427
10428             RExC_parse++;
10429
10430         defchar: {
10431             STRLEN len = 0;
10432             UV ender;
10433             char *p;
10434             char *s;
10435 #define MAX_NODE_STRING_SIZE 127
10436             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10437             char *s0;
10438             U8 upper_parse = MAX_NODE_STRING_SIZE;
10439             STRLEN foldlen;
10440             U8 node_type;
10441             bool next_is_quantifier;
10442             char * oldp;
10443
10444             ender = 0;
10445             node_type = compute_EXACTish(pRExC_state);
10446             ret = reg_node(pRExC_state, node_type);
10447
10448             /* In pass1, folded, we use a temporary buffer instead of the
10449              * actual node, as the node doesn't exist yet */
10450             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10451
10452             s0 = s;
10453
10454         reparse:
10455
10456             /* XXX The node can hold up to 255 bytes, yet this only goes to
10457              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10458              * 255 allows us to not have to worry about overflow due to
10459              * converting to utf8 and fold expansion, but that value is
10460              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10461              * split up by this limit into a single one using the real max of
10462              * 255.  Even at 127, this breaks under rare circumstances.  If
10463              * folding, we do not want to split a node at a character that is a
10464              * non-final in a multi-char fold, as an input string could just
10465              * happen to want to match across the node boundary.  The join
10466              * would solve that problem if the join actually happens.  But a
10467              * series of more than two nodes in a row each of 127 would cause
10468              * the first join to succeed to get to 254, but then there wouldn't
10469              * be room for the next one, which could at be one of those split
10470              * multi-char folds.  I don't know of any fool-proof solution.  One
10471              * could back off to end with only a code point that isn't such a
10472              * non-final, but it is possible for there not to be any in the
10473              * entire node. */
10474             for (p = RExC_parse - 1;
10475                  len < upper_parse && p < RExC_end;
10476                  len++)
10477             {
10478                 oldp = p;
10479
10480                 if (RExC_flags & RXf_PMf_EXTENDED)
10481                     p = regwhite( pRExC_state, p );
10482                 switch ((U8)*p) {
10483                 case '^':
10484                 case '$':
10485                 case '.':
10486                 case '[':
10487                 case '(':
10488                 case ')':
10489                 case '|':
10490                     goto loopdone;
10491                 case '\\':
10492                     /* Literal Escapes Switch
10493
10494                        This switch is meant to handle escape sequences that
10495                        resolve to a literal character.
10496
10497                        Every escape sequence that represents something
10498                        else, like an assertion or a char class, is handled
10499                        in the switch marked 'Special Escapes' above in this
10500                        routine, but also has an entry here as anything that
10501                        isn't explicitly mentioned here will be treated as
10502                        an unescaped equivalent literal.
10503                     */
10504
10505                     switch ((U8)*++p) {
10506                     /* These are all the special escapes. */
10507                     case 'A':             /* Start assertion */
10508                     case 'b': case 'B':   /* Word-boundary assertion*/
10509                     case 'C':             /* Single char !DANGEROUS! */
10510                     case 'd': case 'D':   /* digit class */
10511                     case 'g': case 'G':   /* generic-backref, pos assertion */
10512                     case 'h': case 'H':   /* HORIZWS */
10513                     case 'k': case 'K':   /* named backref, keep marker */
10514                     case 'p': case 'P':   /* Unicode property */
10515                               case 'R':   /* LNBREAK */
10516                     case 's': case 'S':   /* space class */
10517                     case 'v': case 'V':   /* VERTWS */
10518                     case 'w': case 'W':   /* word class */
10519                     case 'X':             /* eXtended Unicode "combining character sequence" */
10520                     case 'z': case 'Z':   /* End of line/string assertion */
10521                         --p;
10522                         goto loopdone;
10523
10524                     /* Anything after here is an escape that resolves to a
10525                        literal. (Except digits, which may or may not)
10526                      */
10527                     case 'n':
10528                         ender = '\n';
10529                         p++;
10530                         break;
10531                     case 'N': /* Handle a single-code point named character. */
10532                         /* The options cause it to fail if a multiple code
10533                          * point sequence.  Handle those in the switch() above
10534                          * */
10535                         RExC_parse = p + 1;
10536                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10537                                             flagp, depth, FALSE))
10538                         {
10539                             RExC_parse = p = oldp;
10540                             goto loopdone;
10541                         }
10542                         p = RExC_parse;
10543                         if (ender > 0xff) {
10544                             REQUIRE_UTF8;
10545                         }
10546                         break;
10547                     case 'r':
10548                         ender = '\r';
10549                         p++;
10550                         break;
10551                     case 't':
10552                         ender = '\t';
10553                         p++;
10554                         break;
10555                     case 'f':
10556                         ender = '\f';
10557                         p++;
10558                         break;
10559                     case 'e':
10560                           ender = ASCII_TO_NATIVE('\033');
10561                         p++;
10562                         break;
10563                     case 'a':
10564                           ender = ASCII_TO_NATIVE('\007');
10565                         p++;
10566                         break;
10567                     case 'o':
10568                         {
10569                             STRLEN brace_len = len;
10570                             UV result;
10571                             const char* error_msg;
10572
10573                             bool valid = grok_bslash_o(p,
10574                                                        &result,
10575                                                        &brace_len,
10576                                                        &error_msg,
10577                                                        1);
10578                             p += brace_len;
10579                             if (! valid) {
10580                                 RExC_parse = p; /* going to die anyway; point
10581                                                    to exact spot of failure */
10582                                 vFAIL(error_msg);
10583                             }
10584                             else
10585                             {
10586                                 ender = result;
10587                             }
10588                             if (PL_encoding && ender < 0x100) {
10589                                 goto recode_encoding;
10590                             }
10591                             if (ender > 0xff) {
10592                                 REQUIRE_UTF8;
10593                             }
10594                             break;
10595                         }
10596                     case 'x':
10597                         {
10598                             STRLEN brace_len = len;
10599                             UV result;
10600                             const char* error_msg;
10601
10602                             bool valid = grok_bslash_x(p,
10603                                                        &result,
10604                                                        &brace_len,
10605                                                        &error_msg,
10606                                                        1);
10607                             p += brace_len;
10608                             if (! valid) {
10609                                 RExC_parse = p; /* going to die anyway; point
10610                                                    to exact spot of failure */
10611                                 vFAIL(error_msg);
10612                             }
10613                             else {
10614                                 ender = result;
10615                             }
10616                             if (PL_encoding && ender < 0x100) {
10617                                 goto recode_encoding;
10618                             }
10619                             if (ender > 0xff) {
10620                                 REQUIRE_UTF8;
10621                             }
10622                             break;
10623                         }
10624                     case 'c':
10625                         p++;
10626                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10627                         break;
10628                     case '0': case '1': case '2': case '3':case '4':
10629                     case '5': case '6': case '7':
10630                         if (*p == '0' ||
10631                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10632                         {
10633                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10634                             STRLEN numlen = 3;
10635                             ender = grok_oct(p, &numlen, &flags, NULL);
10636                             if (ender > 0xff) {
10637                                 REQUIRE_UTF8;
10638                             }
10639                             p += numlen;
10640                         }
10641                         else {
10642                             --p;
10643                             goto loopdone;
10644                         }
10645                         if (PL_encoding && ender < 0x100)
10646                             goto recode_encoding;
10647                         break;
10648                     recode_encoding:
10649                         if (! RExC_override_recoding) {
10650                             SV* enc = PL_encoding;
10651                             ender = reg_recode((const char)(U8)ender, &enc);
10652                             if (!enc && SIZE_ONLY)
10653                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10654                             REQUIRE_UTF8;
10655                         }
10656                         break;
10657                     case '\0':
10658                         if (p >= RExC_end)
10659                             FAIL("Trailing \\");
10660                         /* FALL THROUGH */
10661                     default:
10662                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10663                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10664                         }
10665                         goto normal_default;
10666                     }
10667                     break;
10668                 case '{':
10669                     /* Currently we don't warn when the lbrace is at the start
10670                      * of a construct.  This catches it in the middle of a
10671                      * literal string, or when its the first thing after
10672                      * something like "\b" */
10673                     if (! SIZE_ONLY
10674                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10675                     {
10676                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10677                     }
10678                     /*FALLTHROUGH*/
10679                 default:
10680                   normal_default:
10681                     if (UTF8_IS_START(*p) && UTF) {
10682                         STRLEN numlen;
10683                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10684                                                &numlen, UTF8_ALLOW_DEFAULT);
10685                         p += numlen;
10686                     }
10687                     else
10688                         ender = (U8) *p++;
10689                     break;
10690                 } /* End of switch on the literal */
10691
10692                 /* Here, have looked at the literal character and <ender>
10693                  * contains its ordinal, <p> points to the character after it
10694                  */
10695
10696                 if ( RExC_flags & RXf_PMf_EXTENDED)
10697                     p = regwhite( pRExC_state, p );
10698
10699                 /* If the next thing is a quantifier, it applies to this
10700                  * character only, which means that this character has to be in
10701                  * its own node and can't just be appended to the string in an
10702                  * existing node, so if there are already other characters in
10703                  * the node, close the node with just them, and set up to do
10704                  * this character again next time through, when it will be the
10705                  * only thing in its new node */
10706                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10707                 {
10708                     p = oldp;
10709                     goto loopdone;
10710                 }
10711
10712                 if (FOLD) {
10713                     if (UTF
10714                             /* See comments for join_exact() as to why we fold
10715                              * this non-UTF at compile time */
10716                         || (node_type == EXACTFU
10717                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10718                     {
10719
10720
10721                         /* Prime the casefolded buffer.  Locale rules, which
10722                          * apply only to code points < 256, aren't known until
10723                          * execution, so for them, just output the original
10724                          * character using utf8.  If we start to fold non-UTF
10725                          * patterns, be sure to update join_exact() */
10726                         if (LOC && ender < 256) {
10727                             if (UNI_IS_INVARIANT(ender)) {
10728                                 *s = (U8) ender;
10729                                 foldlen = 1;
10730                             } else {
10731                                 *s = UTF8_TWO_BYTE_HI(ender);
10732                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10733                                 foldlen = 2;
10734                             }
10735                         }
10736                         else {
10737                             ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
10738                                     FOLD_FLAGS_FULL
10739                                      | ((LOC) ?  FOLD_FLAGS_LOCALE
10740                                               : (ASCII_FOLD_RESTRICTED)
10741                                                 ? FOLD_FLAGS_NOMIX_ASCII
10742                                                 : 0)
10743                                 );
10744                         }
10745                         s += foldlen;
10746
10747                         /* The loop increments <len> each time, as all but this
10748                          * path (and the one just below for UTF) through it add
10749                          * a single byte to the EXACTish node.  But this one
10750                          * has changed len to be the correct final value, so
10751                          * subtract one to cancel out the increment that
10752                          * follows */
10753                         len += foldlen - 1;
10754                     }
10755                     else {
10756                         *(s++) = ender;
10757                     }
10758                 }
10759                 else if (UTF) {
10760                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10761                     if (unilen > 0) {
10762                        s   += unilen;
10763                        len += unilen;
10764                     }
10765
10766                     /* See comment just above for - 1 */
10767                     len--;
10768                 }
10769                 else {
10770                     REGC((char)ender, s++);
10771                 }
10772
10773                 if (next_is_quantifier) {
10774
10775                     /* Here, the next input is a quantifier, and to get here,
10776                      * the current character is the only one in the node.
10777                      * Also, here <len> doesn't include the final byte for this
10778                      * character */
10779                     len++;
10780                     goto loopdone;
10781                 }
10782
10783             } /* End of loop through literal characters */
10784
10785             /* Here we have either exhausted the input or ran out of room in
10786              * the node.  (If we encountered a character that can't be in the
10787              * node, transfer is made directly to <loopdone>, and so we
10788              * wouldn't have fallen off the end of the loop.)  In the latter
10789              * case, we artificially have to split the node into two, because
10790              * we just don't have enough space to hold everything.  This
10791              * creates a problem if the final character participates in a
10792              * multi-character fold in the non-final position, as a match that
10793              * should have occurred won't, due to the way nodes are matched,
10794              * and our artificial boundary.  So back off until we find a non-
10795              * problematic character -- one that isn't at the beginning or
10796              * middle of such a fold.  (Either it doesn't participate in any
10797              * folds, or appears only in the final position of all the folds it
10798              * does participate in.)  A better solution with far fewer false
10799              * positives, and that would fill the nodes more completely, would
10800              * be to actually have available all the multi-character folds to
10801              * test against, and to back-off only far enough to be sure that
10802              * this node isn't ending with a partial one.  <upper_parse> is set
10803              * further below (if we need to reparse the node) to include just
10804              * up through that final non-problematic character that this code
10805              * identifies, so when it is set to less than the full node, we can
10806              * skip the rest of this */
10807             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10808
10809                 const STRLEN full_len = len;
10810
10811                 assert(len >= MAX_NODE_STRING_SIZE);
10812
10813                 /* Here, <s> points to the final byte of the final character.
10814                  * Look backwards through the string until find a non-
10815                  * problematic character */
10816
10817                 if (! UTF) {
10818
10819                     /* These two have no multi-char folds to non-UTF characters
10820                      */
10821                     if (ASCII_FOLD_RESTRICTED || LOC) {
10822                         goto loopdone;
10823                     }
10824
10825                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10826                     len = s - s0 + 1;
10827                 }
10828                 else {
10829                     if (!  PL_NonL1NonFinalFold) {
10830                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10831                                         NonL1_Perl_Non_Final_Folds_invlist);
10832                     }
10833
10834                     /* Point to the first byte of the final character */
10835                     s = (char *) utf8_hop((U8 *) s, -1);
10836
10837                     while (s >= s0) {   /* Search backwards until find
10838                                            non-problematic char */
10839                         if (UTF8_IS_INVARIANT(*s)) {
10840
10841                             /* There are no ascii characters that participate
10842                              * in multi-char folds under /aa.  In EBCDIC, the
10843                              * non-ascii invariants are all control characters,
10844                              * so don't ever participate in any folds. */
10845                             if (ASCII_FOLD_RESTRICTED
10846                                 || ! IS_NON_FINAL_FOLD(*s))
10847                             {
10848                                 break;
10849                             }
10850                         }
10851                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10852
10853                             /* No Latin1 characters participate in multi-char
10854                              * folds under /l */
10855                             if (LOC
10856                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10857                                                                 *s, *(s+1))))
10858                             {
10859                                 break;
10860                             }
10861                         }
10862                         else if (! _invlist_contains_cp(
10863                                         PL_NonL1NonFinalFold,
10864                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10865                         {
10866                             break;
10867                         }
10868
10869                         /* Here, the current character is problematic in that
10870                          * it does occur in the non-final position of some
10871                          * fold, so try the character before it, but have to
10872                          * special case the very first byte in the string, so
10873                          * we don't read outside the string */
10874                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10875                     } /* End of loop backwards through the string */
10876
10877                     /* If there were only problematic characters in the string,
10878                      * <s> will point to before s0, in which case the length
10879                      * should be 0, otherwise include the length of the
10880                      * non-problematic character just found */
10881                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10882                 }
10883
10884                 /* Here, have found the final character, if any, that is
10885                  * non-problematic as far as ending the node without splitting
10886                  * it across a potential multi-char fold.  <len> contains the
10887                  * number of bytes in the node up-to and including that
10888                  * character, or is 0 if there is no such character, meaning
10889                  * the whole node contains only problematic characters.  In
10890                  * this case, give up and just take the node as-is.  We can't
10891                  * do any better */
10892                 if (len == 0) {
10893                     len = full_len;
10894                 } else {
10895
10896                     /* Here, the node does contain some characters that aren't
10897                      * problematic.  If one such is the final character in the
10898                      * node, we are done */
10899                     if (len == full_len) {
10900                         goto loopdone;
10901                     }
10902                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10903
10904                         /* If the final character is problematic, but the
10905                          * penultimate is not, back-off that last character to
10906                          * later start a new node with it */
10907                         p = oldp;
10908                         goto loopdone;
10909                     }
10910
10911                     /* Here, the final non-problematic character is earlier
10912                      * in the input than the penultimate character.  What we do
10913                      * is reparse from the beginning, going up only as far as
10914                      * this final ok one, thus guaranteeing that the node ends
10915                      * in an acceptable character.  The reason we reparse is
10916                      * that we know how far in the character is, but we don't
10917                      * know how to correlate its position with the input parse.
10918                      * An alternate implementation would be to build that
10919                      * correlation as we go along during the original parse,
10920                      * but that would entail extra work for every node, whereas
10921                      * this code gets executed only when the string is too
10922                      * large for the node, and the final two characters are
10923                      * problematic, an infrequent occurrence.  Yet another
10924                      * possible strategy would be to save the tail of the
10925                      * string, and the next time regatom is called, initialize
10926                      * with that.  The problem with this is that unless you
10927                      * back off one more character, you won't be guaranteed
10928                      * regatom will get called again, unless regbranch,
10929                      * regpiece ... are also changed.  If you do back off that
10930                      * extra character, so that there is input guaranteed to
10931                      * force calling regatom, you can't handle the case where
10932                      * just the first character in the node is acceptable.  I
10933                      * (khw) decided to try this method which doesn't have that
10934                      * pitfall; if performance issues are found, we can do a
10935                      * combination of the current approach plus that one */
10936                     upper_parse = len;
10937                     len = 0;
10938                     s = s0;
10939                     goto reparse;
10940                 }
10941             }   /* End of verifying node ends with an appropriate char */
10942
10943         loopdone:   /* Jumped to when encounters something that shouldn't be in
10944                        the node */
10945
10946             /* I (khw) don't know if you can get here with zero length, but the
10947              * old code handled this situation by creating a zero-length EXACT
10948              * node.  Might as well be NOTHING instead */
10949             if (len == 0) {
10950                 OP(ret) = NOTHING;
10951             }
10952             else{
10953                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
10954             }
10955
10956             RExC_parse = p - 1;
10957             Set_Node_Cur_Length(ret); /* MJD */
10958             nextchar(pRExC_state);
10959             {
10960                 /* len is STRLEN which is unsigned, need to copy to signed */
10961                 IV iv = len;
10962                 if (iv < 0)
10963                     vFAIL("Internal disaster");
10964             }
10965
10966         } /* End of label 'defchar:' */
10967         break;
10968     } /* End of giant switch on input character */
10969
10970     return(ret);
10971 }
10972
10973 STATIC char *
10974 S_regwhite( RExC_state_t *pRExC_state, char *p )
10975 {
10976     const char *e = RExC_end;
10977
10978     PERL_ARGS_ASSERT_REGWHITE;
10979
10980     while (p < e) {
10981         if (isSPACE(*p))
10982             ++p;
10983         else if (*p == '#') {
10984             bool ended = 0;
10985             do {
10986                 if (*p++ == '\n') {
10987                     ended = 1;
10988                     break;
10989                 }
10990             } while (p < e);
10991             if (!ended)
10992                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10993         }
10994         else
10995             break;
10996     }
10997     return p;
10998 }
10999
11000 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11001    Character classes ([:foo:]) can also be negated ([:^foo:]).
11002    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11003    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11004    but trigger failures because they are currently unimplemented. */
11005
11006 #define POSIXCC_DONE(c)   ((c) == ':')
11007 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11008 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11009
11010 STATIC I32
11011 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11012 {
11013     dVAR;
11014     I32 namedclass = OOB_NAMEDCLASS;
11015
11016     PERL_ARGS_ASSERT_REGPPOSIXCC;
11017
11018     if (value == '[' && RExC_parse + 1 < RExC_end &&
11019         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11020         POSIXCC(UCHARAT(RExC_parse))) {
11021         const char c = UCHARAT(RExC_parse);
11022         char* const s = RExC_parse++;
11023
11024         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11025             RExC_parse++;
11026         if (RExC_parse == RExC_end)
11027             /* Grandfather lone [:, [=, [. */
11028             RExC_parse = s;
11029         else {
11030             const char* const t = RExC_parse++; /* skip over the c */
11031             assert(*t == c);
11032
11033             if (UCHARAT(RExC_parse) == ']') {
11034                 const char *posixcc = s + 1;
11035                 RExC_parse++; /* skip over the ending ] */
11036
11037                 if (*s == ':') {
11038                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11039                     const I32 skip = t - posixcc;
11040
11041                     /* Initially switch on the length of the name.  */
11042                     switch (skip) {
11043                     case 4:
11044                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11045                             namedclass = ANYOF_ALNUM;
11046                         break;
11047                     case 5:
11048                         /* Names all of length 5.  */
11049                         /* alnum alpha ascii blank cntrl digit graph lower
11050                            print punct space upper  */
11051                         /* Offset 4 gives the best switch position.  */
11052                         switch (posixcc[4]) {
11053                         case 'a':
11054                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11055                                 namedclass = ANYOF_ALPHA;
11056                             break;
11057                         case 'e':
11058                             if (memEQ(posixcc, "spac", 4)) /* space */
11059                                 namedclass = ANYOF_PSXSPC;
11060                             break;
11061                         case 'h':
11062                             if (memEQ(posixcc, "grap", 4)) /* graph */
11063                                 namedclass = ANYOF_GRAPH;
11064                             break;
11065                         case 'i':
11066                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11067                                 namedclass = ANYOF_ASCII;
11068                             break;
11069                         case 'k':
11070                             if (memEQ(posixcc, "blan", 4)) /* blank */
11071                                 namedclass = ANYOF_BLANK;
11072                             break;
11073                         case 'l':
11074                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11075                                 namedclass = ANYOF_CNTRL;
11076                             break;
11077                         case 'm':
11078                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11079                                 namedclass = ANYOF_ALNUMC;
11080                             break;
11081                         case 'r':
11082                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11083                                 namedclass = ANYOF_LOWER;
11084                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11085                                 namedclass = ANYOF_UPPER;
11086                             break;
11087                         case 't':
11088                             if (memEQ(posixcc, "digi", 4)) /* digit */
11089                                 namedclass = ANYOF_DIGIT;
11090                             else if (memEQ(posixcc, "prin", 4)) /* print */
11091                                 namedclass = ANYOF_PRINT;
11092                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11093                                 namedclass = ANYOF_PUNCT;
11094                             break;
11095                         }
11096                         break;
11097                     case 6:
11098                         if (memEQ(posixcc, "xdigit", 6))
11099                             namedclass = ANYOF_XDIGIT;
11100                         break;
11101                     }
11102
11103                     if (namedclass == OOB_NAMEDCLASS)
11104                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11105                                       t - s - 1, s + 1);
11106
11107                     /* The #defines are structured so each complement is +1 to
11108                      * the normal one */
11109                     if (complement) {
11110                         namedclass++;
11111                     }
11112                     assert (posixcc[skip] == ':');
11113                     assert (posixcc[skip+1] == ']');
11114                 } else if (!SIZE_ONLY) {
11115                     /* [[=foo=]] and [[.foo.]] are still future. */
11116
11117                     /* adjust RExC_parse so the warning shows after
11118                        the class closes */
11119                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11120                         RExC_parse++;
11121                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11122                 }
11123             } else {
11124                 /* Maternal grandfather:
11125                  * "[:" ending in ":" but not in ":]" */
11126                 RExC_parse = s;
11127             }
11128         }
11129     }
11130
11131     return namedclass;
11132 }
11133
11134 STATIC void
11135 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11136 {
11137     dVAR;
11138
11139     PERL_ARGS_ASSERT_CHECKPOSIXCC;
11140
11141     if (POSIXCC(UCHARAT(RExC_parse))) {
11142         const char *s = RExC_parse;
11143         const char  c = *s++;
11144
11145         while (isALNUM(*s))
11146             s++;
11147         if (*s && c == *s && s[1] == ']') {
11148             ckWARN3reg(s+2,
11149                        "POSIX syntax [%c %c] belongs inside character classes",
11150                        c, c);
11151
11152             /* [[=foo=]] and [[.foo.]] are still future. */
11153             if (POSIXCC_NOTYET(c)) {
11154                 /* adjust RExC_parse so the error shows after
11155                    the class closes */
11156                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11157                     NOOP;
11158                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11159             }
11160         }
11161     }
11162 }
11163
11164 /* Generate the code to add a full posix character <class> to the bracketed
11165  * character class given by <node>.  (<node> is needed only under locale rules)
11166  * destlist     is the inversion list for non-locale rules that this class is
11167  *              to be added to
11168  * sourcelist   is the ASCII-range inversion list to add under /a rules
11169  * Xsourcelist  is the full Unicode range list to use otherwise. */
11170 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11171     if (LOC) {                                                             \
11172         SV* scratch_list = NULL;                                           \
11173                                                                            \
11174         /* Set this class in the node for runtime matching */              \
11175         ANYOF_CLASS_SET(node, class);                                      \
11176                                                                            \
11177         /* For above Latin1 code points, we use the full Unicode range */  \
11178         _invlist_intersection(PL_AboveLatin1,                              \
11179                               Xsourcelist,                                 \
11180                               &scratch_list);                              \
11181         /* And set the output to it, adding instead if there already is an \
11182          * output.  Checking if <destlist> is NULL first saves an extra    \
11183          * clone.  Its reference count will be decremented at the next     \
11184          * union, etc, or if this is the only instance, at the end of the  \
11185          * routine */                                                      \
11186         if (! destlist) {                                                  \
11187             destlist = scratch_list;                                       \
11188         }                                                                  \
11189         else {                                                             \
11190             _invlist_union(destlist, scratch_list, &destlist);             \
11191             SvREFCNT_dec(scratch_list);                                    \
11192         }                                                                  \
11193     }                                                                      \
11194     else {                                                                 \
11195         /* For non-locale, just add it to any existing list */             \
11196         _invlist_union(destlist,                                           \
11197                        (AT_LEAST_ASCII_RESTRICTED)                         \
11198                            ? sourcelist                                    \
11199                            : Xsourcelist,                                  \
11200                        &destlist);                                         \
11201     }
11202
11203 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11204  */
11205 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11206     if (LOC) {                                                             \
11207         SV* scratch_list = NULL;                                           \
11208         ANYOF_CLASS_SET(node, class);                                      \
11209         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11210         if (! destlist) {                                                  \
11211             destlist = scratch_list;                                       \
11212         }                                                                  \
11213         else {                                                             \
11214             _invlist_union(destlist, scratch_list, &destlist);             \
11215             SvREFCNT_dec(scratch_list);                                    \
11216         }                                                                  \
11217     }                                                                      \
11218     else {                                                                 \
11219         _invlist_union_complement_2nd(destlist,                            \
11220                                     (AT_LEAST_ASCII_RESTRICTED)            \
11221                                         ? sourcelist                       \
11222                                         : Xsourcelist,                     \
11223                                     &destlist);                            \
11224         /* Under /d, everything in the upper half of the Latin1 range      \
11225          * matches this complement */                                      \
11226         if (DEPENDS_SEMANTICS) {                                           \
11227             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11228         }                                                                  \
11229     }
11230
11231 /* Generate the code to add a posix character <class> to the bracketed
11232  * character class given by <node>.  (<node> is needed only under locale rules)
11233  * destlist       is the inversion list for non-locale rules that this class is
11234  *                to be added to
11235  * sourcelist     is the ASCII-range inversion list to add under /a rules
11236  * l1_sourcelist  is the Latin1 range list to use otherwise.
11237  * Xpropertyname  is the name to add to <run_time_list> of the property to
11238  *                specify the code points above Latin1 that will have to be
11239  *                determined at run-time
11240  * run_time_list  is a SV* that contains text names of properties that are to
11241  *                be computed at run time.  This concatenates <Xpropertyname>
11242  *                to it, appropriately
11243  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11244  * time */
11245 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11246                               l1_sourcelist, Xpropertyname, run_time_list) \
11247         /* First, resolve whether to use the ASCII-only list or the L1     \
11248          * list */                                                         \
11249         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11250                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11251                 Xpropertyname, run_time_list)
11252
11253 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11254                 Xpropertyname, run_time_list)                              \
11255     /* If not /a matching, there are going to be code points we will have  \
11256      * to defer to runtime to look-up */                                   \
11257     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11258         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11259     }                                                                      \
11260     if (LOC) {                                                             \
11261         ANYOF_CLASS_SET(node, class);                                      \
11262     }                                                                      \
11263     else {                                                                 \
11264         _invlist_union(destlist, sourcelist, &destlist);                   \
11265     }
11266
11267 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11268  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11269  * otherwise */
11270 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11271        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11272     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11273         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11274     }                                                                      \
11275     else {                                                                 \
11276         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11277         matches_above_unicode = TRUE;                                      \
11278         if (LOC) {                                                         \
11279             ANYOF_CLASS_SET(node, namedclass);                             \
11280         }                                                                  \
11281         else {                                                             \
11282             SV* scratch_list = NULL;                                       \
11283             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11284             if (! destlist) {                                              \
11285                 destlist = scratch_list;                                   \
11286             }                                                              \
11287             else {                                                         \
11288                 _invlist_union(destlist, scratch_list, &destlist);         \
11289                 SvREFCNT_dec(scratch_list);                                \
11290             }                                                              \
11291             if (DEPENDS_SEMANTICS) {                                       \
11292                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11293             }                                                              \
11294         }                                                                  \
11295     }
11296
11297 STATIC void
11298 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11299 {
11300     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11301      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
11302      * the multi-character folds of characters in the node */
11303     SV *sv;
11304
11305     PERL_ARGS_ASSERT_ADD_ALTERNATE;
11306
11307     if (! *alternate_ptr) {
11308         *alternate_ptr = newAV();
11309     }
11310     sv = newSVpvn_utf8((char*)string, len, TRUE);
11311     av_push(*alternate_ptr, sv);
11312     return;
11313 }
11314
11315 /* The names of properties whose definitions are not known at compile time are
11316  * stored in this SV, after a constant heading.  So if the length has been
11317  * changed since initialization, then there is a run-time definition. */
11318 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11319
11320 /* This converts the named class defined in regcomp.h to its equivalent class
11321  * number defined in handy.h. */
11322 #define namedclass_to_classnum(class)  ((class) / 2)
11323
11324 /*
11325    parse a class specification and produce either an ANYOF node that
11326    matches the pattern or perhaps will be optimized into an EXACTish node
11327    instead. The node contains a bit map for the first 256 characters, with the
11328    corresponding bit set if that character is in the list.  For characters
11329    above 255, a range list is used */
11330
11331 STATIC regnode *
11332 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11333 {
11334     dVAR;
11335     UV nextvalue;
11336     UV prevvalue = OOB_UNICODE;
11337     IV range = 0;
11338     UV value = 0;
11339     regnode *ret;
11340     STRLEN numlen;
11341     IV namedclass = OOB_NAMEDCLASS;
11342     char *rangebegin = NULL;
11343     bool need_class = 0;
11344     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
11345     SV *listsv = NULL;
11346     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11347                                       than just initialized.  */
11348     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11349     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11350                                extended beyond the Latin1 range */
11351     UV element_count = 0;   /* Number of distinct elements in the class.
11352                                Optimizations may be possible if this is tiny */
11353     UV n;
11354
11355     /* Unicode properties are stored in a swash; this holds the current one
11356      * being parsed.  If this swash is the only above-latin1 component of the
11357      * character class, an optimization is to pass it directly on to the
11358      * execution engine.  Otherwise, it is set to NULL to indicate that there
11359      * are other things in the class that have to be dealt with at execution
11360      * time */
11361     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11362
11363     /* Set if a component of this character class is user-defined; just passed
11364      * on to the engine */
11365     bool has_user_defined_property = FALSE;
11366
11367     /* inversion list of code points this node matches only when the target
11368      * string is in UTF-8.  (Because is under /d) */
11369     SV* depends_list = NULL;
11370
11371     /* inversion list of code points this node matches.  For much of the
11372      * function, it includes only those that match regardless of the utf8ness
11373      * of the target string */
11374     SV* cp_list = NULL;
11375
11376     /* List of multi-character folds that are matched by this node */
11377     AV* unicode_alternate  = NULL;
11378 #ifdef EBCDIC
11379     /* In a range, counts how many 0-2 of the ends of it came from literals,
11380      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11381     UV literal_endpoint = 0;
11382 #endif
11383     bool invert = FALSE;    /* Is this class to be complemented */
11384
11385     /* Is there any thing like \W or [:^digit:] that matches above the legal
11386      * Unicode range? */
11387     bool runtime_posix_matches_above_Unicode = FALSE;
11388
11389     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11390         case we need to change the emitted regop to an EXACT. */
11391     const char * orig_parse = RExC_parse;
11392     const I32 orig_size = RExC_size;
11393     GET_RE_DEBUG_FLAGS_DECL;
11394
11395     PERL_ARGS_ASSERT_REGCLASS;
11396 #ifndef DEBUGGING
11397     PERL_UNUSED_ARG(depth);
11398 #endif
11399
11400     DEBUG_PARSE("clas");
11401
11402     /* Assume we are going to generate an ANYOF node. */
11403     ret = reganode(pRExC_state, ANYOF, 0);
11404
11405
11406     if (!SIZE_ONLY) {
11407         ANYOF_FLAGS(ret) = 0;
11408     }
11409
11410     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11411         RExC_naughty++;
11412         RExC_parse++;
11413         invert = TRUE;
11414
11415         /* We have decided to not allow multi-char folds in inverted character
11416          * classes, due to the confusion that can happen, especially with
11417          * classes that are designed for a non-Unicode world:  You have the
11418          * peculiar case that:
11419             "s s" =~ /^[^\xDF]+$/i => Y
11420             "ss"  =~ /^[^\xDF]+$/i => N
11421          *
11422          * See [perl #89750] */
11423         allow_full_fold = FALSE;
11424     }
11425
11426     if (SIZE_ONLY) {
11427         RExC_size += ANYOF_SKIP;
11428         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11429     }
11430     else {
11431         RExC_emit += ANYOF_SKIP;
11432         if (LOC) {
11433             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11434         }
11435         listsv = newSVpvs("# comment\n");
11436         initial_listsv_len = SvCUR(listsv);
11437     }
11438
11439     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11440
11441     if (!SIZE_ONLY && POSIXCC(nextvalue))
11442         checkposixcc(pRExC_state);
11443
11444     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11445     if (UCHARAT(RExC_parse) == ']')
11446         goto charclassloop;
11447
11448 parseit:
11449     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11450
11451     charclassloop:
11452
11453         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11454
11455         if (!range) {
11456             rangebegin = RExC_parse;
11457             element_count++;
11458         }
11459         if (UTF) {
11460             value = utf8n_to_uvchr((U8*)RExC_parse,
11461                                    RExC_end - RExC_parse,
11462                                    &numlen, UTF8_ALLOW_DEFAULT);
11463             RExC_parse += numlen;
11464         }
11465         else
11466             value = UCHARAT(RExC_parse++);
11467
11468         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11469         if (value == '[' && POSIXCC(nextvalue))
11470             namedclass = regpposixcc(pRExC_state, value);
11471         else if (value == '\\') {
11472             if (UTF) {
11473                 value = utf8n_to_uvchr((U8*)RExC_parse,
11474                                    RExC_end - RExC_parse,
11475                                    &numlen, UTF8_ALLOW_DEFAULT);
11476                 RExC_parse += numlen;
11477             }
11478             else
11479                 value = UCHARAT(RExC_parse++);
11480             /* Some compilers cannot handle switching on 64-bit integer
11481              * values, therefore value cannot be an UV.  Yes, this will
11482              * be a problem later if we want switch on Unicode.
11483              * A similar issue a little bit later when switching on
11484              * namedclass. --jhi */
11485             switch ((I32)value) {
11486             case 'w':   namedclass = ANYOF_ALNUM;       break;
11487             case 'W':   namedclass = ANYOF_NALNUM;      break;
11488             case 's':   namedclass = ANYOF_SPACE;       break;
11489             case 'S':   namedclass = ANYOF_NSPACE;      break;
11490             case 'd':   namedclass = ANYOF_DIGIT;       break;
11491             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11492             case 'v':   namedclass = ANYOF_VERTWS;      break;
11493             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11494             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11495             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11496             case 'N':  /* Handle \N{NAME} in class */
11497                 {
11498                     /* We only pay attention to the first char of
11499                     multichar strings being returned. I kinda wonder
11500                     if this makes sense as it does change the behaviour
11501                     from earlier versions, OTOH that behaviour was broken
11502                     as well. */
11503                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11504                                       TRUE /* => charclass */))
11505                     {
11506                         goto parseit;
11507                     }
11508                 }
11509                 break;
11510             case 'p':
11511             case 'P':
11512                 {
11513                 char *e;
11514                 if (RExC_parse >= RExC_end)
11515                     vFAIL2("Empty \\%c{}", (U8)value);
11516                 if (*RExC_parse == '{') {
11517                     const U8 c = (U8)value;
11518                     e = strchr(RExC_parse++, '}');
11519                     if (!e)
11520                         vFAIL2("Missing right brace on \\%c{}", c);
11521                     while (isSPACE(UCHARAT(RExC_parse)))
11522                         RExC_parse++;
11523                     if (e == RExC_parse)
11524                         vFAIL2("Empty \\%c{}", c);
11525                     n = e - RExC_parse;
11526                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11527                         n--;
11528                 }
11529                 else {
11530                     e = RExC_parse;
11531                     n = 1;
11532                 }
11533                 if (!SIZE_ONLY) {
11534                     SV* invlist;
11535                     char* name;
11536
11537                     if (UCHARAT(RExC_parse) == '^') {
11538                          RExC_parse++;
11539                          n--;
11540                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11541                          while (isSPACE(UCHARAT(RExC_parse))) {
11542                               RExC_parse++;
11543                               n--;
11544                          }
11545                     }
11546                     /* Try to get the definition of the property into
11547                      * <invlist>.  If /i is in effect, the effective property
11548                      * will have its name be <__NAME_i>.  The design is
11549                      * discussed in commit
11550                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11551                     Newx(name, n + sizeof("_i__\n"), char);
11552
11553                     sprintf(name, "%s%.*s%s\n",
11554                                     (FOLD) ? "__" : "",
11555                                     (int)n,
11556                                     RExC_parse,
11557                                     (FOLD) ? "_i" : ""
11558                     );
11559
11560                     /* Look up the property name, and get its swash and
11561                      * inversion list, if the property is found  */
11562                     if (swash) {
11563                         SvREFCNT_dec(swash);
11564                     }
11565                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11566                                              1, /* binary */
11567                                              0, /* not tr/// */
11568                                              TRUE, /* this routine will handle
11569                                                       undefined properties */
11570                                              NULL, FALSE /* No inversion list */
11571                                             );
11572                     if (   ! swash
11573                         || ! SvROK(swash)
11574                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11575                         || ! (invlist = _get_swash_invlist(swash)))
11576                     {
11577                         if (swash) {
11578                             SvREFCNT_dec(swash);
11579                             swash = NULL;
11580                         }
11581
11582                         /* Here didn't find it.  It could be a user-defined
11583                          * property that will be available at run-time.  Add it
11584                          * to the list to look up then */
11585                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11586                                         (value == 'p' ? '+' : '!'),
11587                                         name);
11588                         has_user_defined_property = TRUE;
11589
11590                         /* We don't know yet, so have to assume that the
11591                          * property could match something in the Latin1 range,
11592                          * hence something that isn't utf8.  Note that this
11593                          * would cause things in <depends_list> to match
11594                          * inappropriately, except that any \p{}, including
11595                          * this one forces Unicode semantics, which means there
11596                          * is <no depends_list> */
11597                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11598                     }
11599                     else {
11600
11601                         /* Here, did get the swash and its inversion list.  If
11602                          * the swash is from a user-defined property, then this
11603                          * whole character class should be regarded as such */
11604                         has_user_defined_property =
11605                                                 _is_swash_user_defined(swash);
11606
11607                         /* Invert if asking for the complement */
11608                         if (value == 'P') {
11609                             _invlist_union_complement_2nd(properties,
11610                                                           invlist,
11611                                                           &properties);
11612
11613                             /* The swash can't be used as-is, because we've
11614                              * inverted things; delay removing it to here after
11615                              * have copied its invlist above */
11616                             SvREFCNT_dec(swash);
11617                             swash = NULL;
11618                         }
11619                         else {
11620                             _invlist_union(properties, invlist, &properties);
11621                         }
11622                     }
11623                     Safefree(name);
11624                 }
11625                 RExC_parse = e + 1;
11626                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11627
11628                 /* \p means they want Unicode semantics */
11629                 RExC_uni_semantics = 1;
11630                 }
11631                 break;
11632             case 'n':   value = '\n';                   break;
11633             case 'r':   value = '\r';                   break;
11634             case 't':   value = '\t';                   break;
11635             case 'f':   value = '\f';                   break;
11636             case 'b':   value = '\b';                   break;
11637             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11638             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11639             case 'o':
11640                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11641                 {
11642                     const char* error_msg;
11643                     bool valid = grok_bslash_o(RExC_parse,
11644                                                &value,
11645                                                &numlen,
11646                                                &error_msg,
11647                                                SIZE_ONLY);
11648                     RExC_parse += numlen;
11649                     if (! valid) {
11650                         vFAIL(error_msg);
11651                     }
11652                 }
11653                 if (PL_encoding && value < 0x100) {
11654                     goto recode_encoding;
11655                 }
11656                 break;
11657             case 'x':
11658                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11659                 {
11660                     const char* error_msg;
11661                     bool valid = grok_bslash_x(RExC_parse,
11662                                                &value,
11663                                                &numlen,
11664                                                &error_msg,
11665                                                1);
11666                     RExC_parse += numlen;
11667                     if (! valid) {
11668                         vFAIL(error_msg);
11669                     }
11670                 }
11671                 if (PL_encoding && value < 0x100)
11672                     goto recode_encoding;
11673                 break;
11674             case 'c':
11675                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11676                 break;
11677             case '0': case '1': case '2': case '3': case '4':
11678             case '5': case '6': case '7':
11679                 {
11680                     /* Take 1-3 octal digits */
11681                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11682                     numlen = 3;
11683                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11684                     RExC_parse += numlen;
11685                     if (PL_encoding && value < 0x100)
11686                         goto recode_encoding;
11687                     break;
11688                 }
11689             recode_encoding:
11690                 if (! RExC_override_recoding) {
11691                     SV* enc = PL_encoding;
11692                     value = reg_recode((const char)(U8)value, &enc);
11693                     if (!enc && SIZE_ONLY)
11694                         ckWARNreg(RExC_parse,
11695                                   "Invalid escape in the specified encoding");
11696                     break;
11697                 }
11698             default:
11699                 /* Allow \_ to not give an error */
11700                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11701                     ckWARN2reg(RExC_parse,
11702                                "Unrecognized escape \\%c in character class passed through",
11703                                (int)value);
11704                 }
11705                 break;
11706             }
11707         } /* end of \blah */
11708 #ifdef EBCDIC
11709         else
11710             literal_endpoint++;
11711 #endif
11712
11713             /* What matches in a locale is not known until runtime.  This
11714              * includes what the Posix classes (like \w, [:space:]) match.
11715              * Room must be reserved (one time per class) to store such
11716              * classes, either if Perl is compiled so that locale nodes always
11717              * should have this space, or if there is such class info to be
11718              * stored.  The space will contain a bit for each named class that
11719              * is to be matched against.  This isn't needed for \p{} and
11720              * pseudo-classes, as they are not affected by locale, and hence
11721              * are dealt with separately */
11722             if (LOC
11723                 && ! need_class
11724                 && (ANYOF_LOCALE == ANYOF_CLASS
11725                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11726             {
11727                 need_class = 1;
11728                 if (SIZE_ONLY) {
11729                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11730                 }
11731                 else {
11732                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11733                     ANYOF_CLASS_ZERO(ret);
11734                 }
11735                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11736             }
11737
11738         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11739
11740             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11741              * literal, as is the character that began the false range, i.e.
11742              * the 'a' in the examples */
11743             if (range) {
11744                 if (!SIZE_ONLY) {
11745                     const int w =
11746                         RExC_parse >= rangebegin ?
11747                         RExC_parse - rangebegin : 0;
11748                     ckWARN4reg(RExC_parse,
11749                                "False [] range \"%*.*s\"",
11750                                w, w, rangebegin);
11751                     cp_list = add_cp_to_invlist(cp_list, '-');
11752                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11753                 }
11754
11755                 range = 0; /* this was not a true range */
11756                 element_count += 2; /* So counts for three values */
11757             }
11758
11759             if (! SIZE_ONLY) {
11760                 switch ((I32)namedclass) {
11761
11762                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11763                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11764                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11765                     break;
11766                 case ANYOF_NALNUMC:
11767                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11768                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11769                         runtime_posix_matches_above_Unicode);
11770                     break;
11771                 case ANYOF_ALPHA:
11772                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11773                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11774                     break;
11775                 case ANYOF_NALPHA:
11776                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11777                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11778                         runtime_posix_matches_above_Unicode);
11779                     break;
11780                 case ANYOF_ASCII:
11781                     if (LOC) {
11782                         ANYOF_CLASS_SET(ret, namedclass);
11783                     }
11784                     else {
11785                         _invlist_union(posixes, PL_ASCII, &posixes);
11786                     }
11787                     break;
11788                 case ANYOF_NASCII:
11789                     if (LOC) {
11790                         ANYOF_CLASS_SET(ret, namedclass);
11791                     }
11792                     else {
11793                         _invlist_union_complement_2nd(posixes,
11794                                                     PL_ASCII, &posixes);
11795                         if (DEPENDS_SEMANTICS) {
11796                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11797                         }
11798                     }
11799                     break;
11800                 case ANYOF_BLANK:
11801                     DO_POSIX(ret, namedclass, posixes,
11802                                             PL_PosixBlank, PL_XPosixBlank);
11803                     break;
11804                 case ANYOF_NBLANK:
11805                     DO_N_POSIX(ret, namedclass, posixes,
11806                                             PL_PosixBlank, PL_XPosixBlank);
11807                     break;
11808                 case ANYOF_CNTRL:
11809                     DO_POSIX(ret, namedclass, posixes,
11810                                             PL_PosixCntrl, PL_XPosixCntrl);
11811                     break;
11812                 case ANYOF_NCNTRL:
11813                     DO_N_POSIX(ret, namedclass, posixes,
11814                                             PL_PosixCntrl, PL_XPosixCntrl);
11815                     break;
11816                 case ANYOF_DIGIT:
11817                     /* There are no digits in the Latin1 range outside of
11818                      * ASCII, so call the macro that doesn't have to resolve
11819                      * them */
11820                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11821                         PL_PosixDigit, "XPosixDigit", listsv);
11822                     break;
11823                 case ANYOF_NDIGIT:
11824                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11825                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11826                         runtime_posix_matches_above_Unicode);
11827                     break;
11828                 case ANYOF_GRAPH:
11829                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11830                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11831                     break;
11832                 case ANYOF_NGRAPH:
11833                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11834                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11835                         runtime_posix_matches_above_Unicode);
11836                     break;
11837                 case ANYOF_HORIZWS:
11838                     /* For these, we use the cp_list, as /d doesn't make a
11839                      * difference in what these match.  There would be problems
11840                      * if these characters had folds other than themselves, as
11841                      * cp_list is subject to folding.  It turns out that \h
11842                      * is just a synonym for XPosixBlank */
11843                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11844                     break;
11845                 case ANYOF_NHORIZWS:
11846                     _invlist_union_complement_2nd(cp_list,
11847                                                  PL_XPosixBlank, &cp_list);
11848                     break;
11849                 case ANYOF_LOWER:
11850                 case ANYOF_NLOWER:
11851                 {   /* These require special handling, as they differ under
11852                        folding, matching Cased there (which in the ASCII range
11853                        is the same as Alpha */
11854
11855                     SV* ascii_source;
11856                     SV* l1_source;
11857                     const char *Xname;
11858
11859                     if (FOLD && ! LOC) {
11860                         ascii_source = PL_PosixAlpha;
11861                         l1_source = PL_L1Cased;
11862                         Xname = "Cased";
11863                     }
11864                     else {
11865                         ascii_source = PL_PosixLower;
11866                         l1_source = PL_L1PosixLower;
11867                         Xname = "XPosixLower";
11868                     }
11869                     if (namedclass == ANYOF_LOWER) {
11870                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11871                                     ascii_source, l1_source, Xname, listsv);
11872                     }
11873                     else {
11874                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11875                             posixes, ascii_source, l1_source, Xname, listsv,
11876                             runtime_posix_matches_above_Unicode);
11877                     }
11878                     break;
11879                 }
11880                 case ANYOF_PRINT:
11881                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11882                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11883                     break;
11884                 case ANYOF_NPRINT:
11885                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11886                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11887                         runtime_posix_matches_above_Unicode);
11888                     break;
11889                 case ANYOF_PUNCT:
11890                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11891                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11892                     break;
11893                 case ANYOF_NPUNCT:
11894                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11895                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11896                         runtime_posix_matches_above_Unicode);
11897                     break;
11898                 case ANYOF_PSXSPC:
11899                     DO_POSIX(ret, namedclass, posixes,
11900                                             PL_PosixSpace, PL_XPosixSpace);
11901                     break;
11902                 case ANYOF_NPSXSPC:
11903                     DO_N_POSIX(ret, namedclass, posixes,
11904                                             PL_PosixSpace, PL_XPosixSpace);
11905                     break;
11906                 case ANYOF_SPACE:
11907                     DO_POSIX(ret, namedclass, posixes,
11908                                             PL_PerlSpace, PL_XPerlSpace);
11909                     break;
11910                 case ANYOF_NSPACE:
11911                     DO_N_POSIX(ret, namedclass, posixes,
11912                                             PL_PerlSpace, PL_XPerlSpace);
11913                     break;
11914                 case ANYOF_UPPER:   /* Same as LOWER, above */
11915                 case ANYOF_NUPPER:
11916                 {
11917                     SV* ascii_source;
11918                     SV* l1_source;
11919                     const char *Xname;
11920
11921                     if (FOLD && ! LOC) {
11922                         ascii_source = PL_PosixAlpha;
11923                         l1_source = PL_L1Cased;
11924                         Xname = "Cased";
11925                     }
11926                     else {
11927                         ascii_source = PL_PosixUpper;
11928                         l1_source = PL_L1PosixUpper;
11929                         Xname = "XPosixUpper";
11930                     }
11931                     if (namedclass == ANYOF_UPPER) {
11932                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11933                                     ascii_source, l1_source, Xname, listsv);
11934                     }
11935                     else {
11936                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11937                         posixes, ascii_source, l1_source, Xname, listsv,
11938                         runtime_posix_matches_above_Unicode);
11939                     }
11940                     break;
11941                 }
11942                 case ANYOF_ALNUM:   /* Really is 'Word' */
11943                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11944                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11945                     break;
11946                 case ANYOF_NALNUM:
11947                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11948                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
11949                             runtime_posix_matches_above_Unicode);
11950                     break;
11951                 case ANYOF_VERTWS:
11952                     /* For these, we use the cp_list, as /d doesn't make a
11953                      * difference in what these match.  There would be problems
11954                      * if these characters had folds other than themselves, as
11955                      * cp_list is subject to folding */
11956                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
11957                     break;
11958                 case ANYOF_NVERTWS:
11959                     _invlist_union_complement_2nd(cp_list,
11960                                                     PL_VertSpace, &cp_list);
11961                     break;
11962                 case ANYOF_XDIGIT:
11963                     DO_POSIX(ret, namedclass, posixes,
11964                                             PL_PosixXDigit, PL_XPosixXDigit);
11965                     break;
11966                 case ANYOF_NXDIGIT:
11967                     DO_N_POSIX(ret, namedclass, posixes,
11968                                             PL_PosixXDigit, PL_XPosixXDigit);
11969                     break;
11970                 case ANYOF_MAX:
11971                     /* this is to handle \p and \P */
11972                     break;
11973                 default:
11974                     vFAIL("Invalid [::] class");
11975                     break;
11976                 }
11977
11978                 continue;   /* Go get next character */
11979             }
11980         } /* end of namedclass \blah */
11981
11982         if (range) {
11983             if (prevvalue > value) /* b-a */ {
11984                 const int w = RExC_parse - rangebegin;
11985                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11986                 range = 0; /* not a valid range */
11987             }
11988         }
11989         else {
11990             prevvalue = value; /* save the beginning of the potential range */
11991             if (RExC_parse+1 < RExC_end
11992                 && *RExC_parse == '-'
11993                 && RExC_parse[1] != ']')
11994             {
11995                 RExC_parse++;
11996
11997                 /* a bad range like \w-, [:word:]- ? */
11998                 if (namedclass > OOB_NAMEDCLASS) {
11999                     if (ckWARN(WARN_REGEXP)) {
12000                         const int w =
12001                             RExC_parse >= rangebegin ?
12002                             RExC_parse - rangebegin : 0;
12003                         vWARN4(RExC_parse,
12004                                "False [] range \"%*.*s\"",
12005                                w, w, rangebegin);
12006                     }
12007                     if (!SIZE_ONLY) {
12008                         cp_list = add_cp_to_invlist(cp_list, '-');
12009                     }
12010                     element_count++;
12011                 } else
12012                     range = 1;  /* yeah, it's a range! */
12013                 continue;       /* but do it the next time */
12014             }
12015         }
12016
12017         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12018          * if not */
12019
12020         /* non-Latin1 code point implies unicode semantics.  Must be set in
12021          * pass1 so is there for the whole of pass 2 */
12022         if (value > 255) {
12023             RExC_uni_semantics = 1;
12024         }
12025
12026         /* Ready to process either the single value, or the completed range */
12027         if (!SIZE_ONLY) {
12028 #ifndef EBCDIC
12029             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12030 #else
12031             UV* this_range = _new_invlist(1);
12032             _append_range_to_invlist(this_range, prevvalue, value);
12033
12034             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12035              * If this range was specified using something like 'i-j', we want
12036              * to include only the 'i' and the 'j', and not anything in
12037              * between, so exclude non-ASCII, non-alphabetics from it.
12038              * However, if the range was specified with something like
12039              * [\x89-\x91] or [\x89-j], all code points within it should be
12040              * included.  literal_endpoint==2 means both ends of the range used
12041              * a literal character, not \x{foo} */
12042             if (literal_endpoint == 2
12043                 && (prevvalue >= 'a' && value <= 'z')
12044                     || (prevvalue >= 'A' && value <= 'Z'))
12045             {
12046                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12047                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12048             }
12049             _invlist_union(cp_list, this_range, &cp_list);
12050             literal_endpoint = 0;
12051 #endif
12052         }
12053
12054         range = 0; /* this range (if it was one) is done now */
12055     } /* End of loop through all the text within the brackets */
12056
12057     /* If the character class contains only a single element, it may be
12058      * optimizable into another node type which is smaller and runs faster.
12059      * Check if this is the case for this class */
12060     if (element_count == 1) {
12061         U8 op = END;
12062         U8 arg = 0;
12063
12064         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12065                                               [:digit:] or \p{foo} */
12066
12067             /* Certain named classes have equivalents that can appear outside a
12068              * character class, e.g. \w, \H.  We use these instead of a
12069              * character class. */
12070             switch ((I32)namedclass) {
12071                 U8 offset;
12072
12073                 /* The first group is for node types that depend on the charset
12074                  * modifier to the regex.  We first calculate the base node
12075                  * type, and if it should be inverted */
12076
12077                 case ANYOF_NALNUM:
12078                     invert = ! invert;
12079                     /* FALLTHROUGH */
12080                 case ANYOF_ALNUM:
12081                     op = ALNUM;
12082                     goto join_charset_classes;
12083
12084                 case ANYOF_NSPACE:
12085                     invert = ! invert;
12086                     /* FALLTHROUGH */
12087                 case ANYOF_SPACE:
12088                     op = SPACE;
12089                     goto join_charset_classes;
12090
12091                 case ANYOF_NDIGIT:
12092                     invert = ! invert;
12093                     /* FALLTHROUGH */
12094                 case ANYOF_DIGIT:
12095                     op = DIGIT;
12096
12097                   join_charset_classes:
12098
12099                     /* Now that we have the base node type, we take advantage
12100                      * of the enum ordering of the charset modifiers to get the
12101                      * exact node type,  For example the base SPACE also has
12102                      * SPACEL, SPACEU, and SPACEA */
12103
12104                     offset = get_regex_charset(RExC_flags);
12105
12106                     /* /aa is the same as /a for these */
12107                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12108                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12109                     }
12110                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12111                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12112                     }
12113
12114                     op += offset;
12115
12116                     /* The number of varieties of each of these is the same,
12117                      * hence, so is the delta between the normal and
12118                      * complemented nodes */
12119                     if (invert) {
12120                         op += NALNUM - ALNUM;
12121                     }
12122                     *flagp |= HASWIDTH|SIMPLE;
12123                     break;
12124
12125                 /* The second group doesn't depend of the charset modifiers.
12126                  * We just have normal and complemented */
12127                 case ANYOF_NHORIZWS:
12128                     invert = ! invert;
12129                     /* FALLTHROUGH */
12130                 case ANYOF_HORIZWS:
12131                   is_horizws:
12132                     op = (invert) ? NHORIZWS : HORIZWS;
12133                     *flagp |= HASWIDTH|SIMPLE;
12134                     break;
12135
12136                 case ANYOF_NVERTWS:
12137                     invert = ! invert;
12138                     /* FALLTHROUGH */
12139                 case ANYOF_VERTWS:
12140                     op = (invert) ? NVERTWS : VERTWS;
12141                     *flagp |= HASWIDTH|SIMPLE;
12142                     break;
12143
12144                 case ANYOF_MAX:
12145                     break;
12146
12147                 case ANYOF_NBLANK:
12148                     invert = ! invert;
12149                     /* FALLTHROUGH */
12150                 case ANYOF_BLANK:
12151                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12152                         goto is_horizws;
12153                     }
12154                     /* FALLTHROUGH */
12155                 default:
12156                     /* A generic posix class.  All the /a ones can be handled
12157                      * by the POSIXA opcode.  And all are closed under folding
12158                      * in the ASCII range, so FOLD doesn't matter */
12159                     if (AT_LEAST_ASCII_RESTRICTED
12160                         || (! LOC && namedclass == ANYOF_ASCII))
12161                     {
12162                         /* The odd numbered ones are the complements of the
12163                          * next-lower even number one */
12164                         if (namedclass % 2 == 1) {
12165                             invert = ! invert;
12166                             namedclass--;
12167                         }
12168                         arg = namedclass_to_classnum(namedclass);
12169                         op = (invert) ? NPOSIXA : POSIXA;
12170                     }
12171                     break;
12172             }
12173         }
12174         else if (value == prevvalue) {
12175
12176             /* Here, the class consists of just a single code point */
12177
12178             if (invert) {
12179                 if (! LOC && value == '\n') {
12180                     op = REG_ANY; /* Optimize [^\n] */
12181                     *flagp |= HASWIDTH|SIMPLE;
12182                     RExC_naughty++;
12183                 }
12184             }
12185             else if (value < 256 || UTF) {
12186
12187                 /* Optimize a single value into an EXACTish node, but not if it
12188                  * would require converting the pattern to UTF-8. */
12189                 op = compute_EXACTish(pRExC_state);
12190             }
12191         } /* Otherwise is a range */
12192         else if (! LOC) {   /* locale could vary these */
12193             if (prevvalue == '0') {
12194                 if (value == '9') {
12195                     op = (invert) ? NDIGITA : DIGITA;
12196                     *flagp |= HASWIDTH|SIMPLE;
12197                 }
12198             }
12199         }
12200
12201         /* Here, we have changed <op> away from its initial value iff we found
12202          * an optimization */
12203         if (op != END) {
12204
12205             /* Throw away this ANYOF regnode, and emit the calculated one,
12206              * which should correspond to the beginning, not current, state of
12207              * the parse */
12208             const char * cur_parse = RExC_parse;
12209             RExC_parse = (char *)orig_parse;
12210             if ( SIZE_ONLY) {
12211                 if (! LOC) {
12212
12213                     /* To get locale nodes to not use the full ANYOF size would
12214                      * require moving the code above that writes the portions
12215                      * of it that aren't in other nodes to after this point.
12216                      * e.g.  ANYOF_CLASS_SET */
12217                     RExC_size = orig_size;
12218                 }
12219             }
12220             else {
12221                 RExC_emit = (regnode *)orig_emit;
12222             }
12223
12224             ret = reg_node(pRExC_state, op);
12225
12226             if (PL_regkind[op] == POSIXD) {
12227                 if (! SIZE_ONLY) {
12228                     FLAGS(ret) = arg;
12229                 }
12230                 *flagp |= HASWIDTH|SIMPLE;
12231             }
12232             else if (PL_regkind[op] == EXACT) {
12233                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12234             }
12235
12236             RExC_parse = (char *) cur_parse;
12237
12238             SvREFCNT_dec(listsv);
12239             return ret;
12240         }
12241     }
12242
12243     if (SIZE_ONLY)
12244         return ret;
12245     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12246
12247     /* If folding, we calculate all characters that could fold to or from the
12248      * ones already on the list */
12249     if (FOLD && cp_list) {
12250         UV start, end;  /* End points of code point ranges */
12251
12252         SV* fold_intersection = NULL;
12253
12254         /* In the Latin1 range, the characters that can be folded-to or -from
12255          * are precisely the alphabetic characters.  If the highest code point
12256          * is within Latin1, we can use the compiled-in list, and not have to
12257          * go out to disk. */
12258         if (invlist_highest(cp_list) < 256) {
12259             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12260         }
12261         else {
12262
12263             /* Here, there are non-Latin1 code points, so we will have to go
12264              * fetch the list of all the characters that participate in folds
12265              */
12266             if (! PL_utf8_foldable) {
12267                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12268                                        &PL_sv_undef, 1, 0);
12269                 PL_utf8_foldable = _get_swash_invlist(swash);
12270                 SvREFCNT_dec(swash);
12271             }
12272
12273             /* This is a hash that for a particular fold gives all characters
12274              * that are involved in it */
12275             if (! PL_utf8_foldclosures) {
12276
12277                 /* If we were unable to find any folds, then we likely won't be
12278                  * able to find the closures.  So just create an empty list.
12279                  * Folding will effectively be restricted to the non-Unicode
12280                  * rules hard-coded into Perl.  (This case happens legitimately
12281                  * during compilation of Perl itself before the Unicode tables
12282                  * are generated) */
12283                 if (invlist_len(PL_utf8_foldable) == 0) {
12284                     PL_utf8_foldclosures = newHV();
12285                 }
12286                 else {
12287                     /* If the folds haven't been read in, call a fold function
12288                      * to force that */
12289                     if (! PL_utf8_tofold) {
12290                         U8 dummy[UTF8_MAXBYTES+1];
12291                         STRLEN dummy_len;
12292
12293                         /* This particular string is above \xff in both UTF-8
12294                          * and UTFEBCDIC */
12295                         to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
12296                         assert(PL_utf8_tofold); /* Verify that worked */
12297                     }
12298                     PL_utf8_foldclosures =
12299                                         _swash_inversion_hash(PL_utf8_tofold);
12300                 }
12301             }
12302
12303             /* Only the characters in this class that participate in folds need
12304              * be checked.  Get the intersection of this class and all the
12305              * possible characters that are foldable.  This can quickly narrow
12306              * down a large class */
12307             _invlist_intersection(PL_utf8_foldable, cp_list,
12308                                   &fold_intersection);
12309         }
12310
12311         /* Now look at the foldable characters in this class individually */
12312         invlist_iterinit(fold_intersection);
12313         while (invlist_iternext(fold_intersection, &start, &end)) {
12314             UV j;
12315
12316             /* Locale folding for Latin1 characters is deferred until runtime */
12317             if (LOC && start < 256) {
12318                 start = 256;
12319             }
12320
12321             /* Look at every character in the range */
12322             for (j = start; j <= end; j++) {
12323
12324                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12325                 STRLEN foldlen;
12326                 UV f;
12327
12328                 if (j < 256) {
12329
12330                     /* We have the latin1 folding rules hard-coded here so that
12331                      * an innocent-looking character class, like /[ks]/i won't
12332                      * have to go out to disk to find the possible matches.
12333                      * XXX It would be better to generate these via regen, in
12334                      * case a new version of the Unicode standard adds new
12335                      * mappings, though that is not really likely, and may be
12336                      * caught by the default: case of the switch below. */
12337
12338                     if (PL_fold_latin1[j] != j) {
12339
12340                         /* ASCII is always matched; non-ASCII is matched only
12341                          * under Unicode rules */
12342                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12343                             cp_list =
12344                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12345                         }
12346                         else {
12347                             depends_list =
12348                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12349                         }
12350                     }
12351
12352                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12353                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12354                     {
12355                         /* Certain Latin1 characters have matches outside
12356                          * Latin1, or are multi-character.  To get here, 'j' is
12357                          * one of those characters.   None of these matches is
12358                          * valid for ASCII characters under /aa, which is why
12359                          * the 'if' just above excludes those.  The matches
12360                          * fall into three categories:
12361                          * 1) They are singly folded-to or -from an above 255
12362                          *    character, e.g., LATIN SMALL LETTER Y WITH
12363                          *    DIAERESIS and LATIN CAPITAL LETTER Y WITH
12364                          *    DIAERESIS;
12365                          * 2) They are part of a multi-char fold with another
12366                          *    latin1 character; only LATIN SMALL LETTER
12367                          *    SHARP S => "ss" fits this;
12368                          * 3) They are part of a multi-char fold with a
12369                          *    character outside of Latin1, such as various
12370                          *    ligatures.
12371                         * We aren't dealing fully with multi-char folds, except
12372                         * we do deal with the pattern containing a character
12373                         * that has a multi-char fold (not so much the inverse).
12374                         * For types 1) and 3), the matches only happen when the
12375                         * target string is utf8; that's not true for 2), and we
12376                         * set a flag for it.
12377                         *
12378                         * The code below adds the single fold closures for 'j'
12379                         * to the inversion list. */
12380                         switch (j) {
12381                             case 'k':
12382                             case 'K':
12383                                 cp_list =
12384                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12385                                 break;
12386                             case 's':
12387                             case 'S':
12388                                 cp_list = add_cp_to_invlist(cp_list,
12389                                                     LATIN_SMALL_LETTER_LONG_S);
12390                                 break;
12391                             case MICRO_SIGN:
12392                                 cp_list = add_cp_to_invlist(cp_list,
12393                                                     GREEK_CAPITAL_LETTER_MU);
12394                                 cp_list = add_cp_to_invlist(cp_list,
12395                                                     GREEK_SMALL_LETTER_MU);
12396                                 break;
12397                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12398                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12399                                 cp_list =
12400                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12401                                 break;
12402                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12403                                 cp_list = add_cp_to_invlist(cp_list,
12404                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12405                                 break;
12406                             case LATIN_SMALL_LETTER_SHARP_S:
12407                                 cp_list = add_cp_to_invlist(cp_list,
12408                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12409
12410                                 /* Under /a, /d, and /u, this can match the two
12411                                  * chars "ss" */
12412                                 if (! ASCII_FOLD_RESTRICTED) {
12413                                     add_alternate(&unicode_alternate,
12414                                                   (U8 *) "ss", 2);
12415
12416                                     /* And under /u or /a, it can match even if
12417                                      * the target is not utf8 */
12418                                     if (AT_LEAST_UNI_SEMANTICS) {
12419                                         ANYOF_FLAGS(ret) |=
12420                                                     ANYOF_NONBITMAP_NON_UTF8;
12421                                     }
12422                                 }
12423                                 break;
12424                             case 'F': case 'f':
12425                             case 'I': case 'i':
12426                             case 'L': case 'l':
12427                             case 'T': case 't':
12428                             case 'A': case 'a':
12429                             case 'H': case 'h':
12430                             case 'J': case 'j':
12431                             case 'N': case 'n':
12432                             case 'W': case 'w':
12433                             case 'Y': case 'y':
12434                                 /* These all are targets of multi-character
12435                                  * folds from code points that require UTF8 to
12436                                  * express, so they can't match unless the
12437                                  * target string is in UTF-8, so no action here
12438                                  * is necessary, as regexec.c properly handles
12439                                  * the general case for UTF-8 matching */
12440                                 break;
12441                             default:
12442                                 /* Use deprecated warning to increase the
12443                                  * chances of this being output */
12444                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12445                                 break;
12446                         }
12447                     }
12448                     continue;
12449                 }
12450
12451                 /* Here is an above Latin1 character.  We don't have the rules
12452                  * hard-coded for it.  First, get its fold */
12453                 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12454                                     ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12455                                     | ((LOC)
12456                                         ? FOLD_FLAGS_LOCALE
12457                                         : (ASCII_FOLD_RESTRICTED)
12458                                             ? FOLD_FLAGS_NOMIX_ASCII
12459                                             : 0));
12460
12461                 if (foldlen > (STRLEN)UNISKIP(f)) {
12462
12463                     /* Any multicharacter foldings (disallowed in lookbehind
12464                      * patterns) require the following transform: [ABCDEF] ->
12465                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12466                      * folds into "rst", all other characters fold to single
12467                      * characters.  We save away these multicharacter foldings,
12468                      * to be later saved as part of the additional "s" data. */
12469                     if (! RExC_in_lookbehind) {
12470                         U8* loc = foldbuf;
12471                         U8* e = foldbuf + foldlen;
12472
12473                         /* If any of the folded characters of this are in the
12474                          * Latin1 range, tell the regex engine that this can
12475                          * match a non-utf8 target string.  */
12476                         while (loc < e) {
12477                             if (UTF8_IS_INVARIANT(*loc)
12478                                 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12479                             {
12480                                 ANYOF_FLAGS(ret)
12481                                         |= ANYOF_NONBITMAP_NON_UTF8;
12482                                 break;
12483                             }
12484                             loc += UTF8SKIP(loc);
12485                         }
12486
12487                         add_alternate(&unicode_alternate, foldbuf, foldlen);
12488                     }
12489                 }
12490                 else {
12491                     /* Single character fold of above Latin1.  Add everything
12492                      * in its fold closure to the list that this node should
12493                      * match */
12494                     SV** listp;
12495
12496                     /* The fold closures data structure is a hash with the keys
12497                      * being every character that is folded to, like 'k', and
12498                      * the values each an array of everything that folds to its
12499                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
12500                     if ((listp = hv_fetch(PL_utf8_foldclosures,
12501                                     (char *) foldbuf, foldlen, FALSE)))
12502                     {
12503                         AV* list = (AV*) *listp;
12504                         IV k;
12505                         for (k = 0; k <= av_len(list); k++) {
12506                             SV** c_p = av_fetch(list, k, FALSE);
12507                             UV c;
12508                             if (c_p == NULL) {
12509                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12510                             }
12511                             c = SvUV(*c_p);
12512
12513                             /* /aa doesn't allow folds between ASCII and non-;
12514                              * /l doesn't allow them between above and below
12515                              * 256 */
12516                             if ((ASCII_FOLD_RESTRICTED
12517                                       && (isASCII(c) != isASCII(j)))
12518                                 || (LOC && ((c < 256) != (j < 256))))
12519                             {
12520                                 continue;
12521                             }
12522
12523                             /* Folds involving non-ascii Latin1 characters
12524                              * under /d are added to a separate list */
12525                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12526                             {
12527                                 cp_list = add_cp_to_invlist(cp_list, c);
12528                             }
12529                             else {
12530                               depends_list = add_cp_to_invlist(depends_list, c);
12531                             }
12532                         }
12533                     }
12534                 }
12535             }
12536         }
12537         SvREFCNT_dec(fold_intersection);
12538     }
12539
12540     /* And combine the result (if any) with any inversion list from posix
12541      * classes.  The lists are kept separate up to now because we don't want to
12542      * fold the classes (folding of those is automatically handled by the swash
12543      * fetching code) */
12544     if (posixes) {
12545         if (! DEPENDS_SEMANTICS) {
12546             if (cp_list) {
12547                 _invlist_union(cp_list, posixes, &cp_list);
12548                 SvREFCNT_dec(posixes);
12549             }
12550             else {
12551                 cp_list = posixes;
12552             }
12553         }
12554         else {
12555             /* Under /d, we put into a separate list the Latin1 things that
12556              * match only when the target string is utf8 */
12557             SV* nonascii_but_latin1_properties = NULL;
12558             _invlist_intersection(posixes, PL_Latin1,
12559                                   &nonascii_but_latin1_properties);
12560             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12561                               &nonascii_but_latin1_properties);
12562             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12563                               &posixes);
12564             if (cp_list) {
12565                 _invlist_union(cp_list, posixes, &cp_list);
12566                 SvREFCNT_dec(posixes);
12567             }
12568             else {
12569                 cp_list = posixes;
12570             }
12571
12572             if (depends_list) {
12573                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12574                                &depends_list);
12575                 SvREFCNT_dec(nonascii_but_latin1_properties);
12576             }
12577             else {
12578                 depends_list = nonascii_but_latin1_properties;
12579             }
12580         }
12581     }
12582
12583     /* And combine the result (if any) with any inversion list from properties.
12584      * The lists are kept separate up to now so that we can distinguish the two
12585      * in regards to matching above-Unicode.  A run-time warning is generated
12586      * if a Unicode property is matched against a non-Unicode code point. But,
12587      * we allow user-defined properties to match anything, without any warning,
12588      * and we also suppress the warning if there is a portion of the character
12589      * class that isn't a Unicode property, and which matches above Unicode, \W
12590      * or [\x{110000}] for example.
12591      * (Note that in this case, unlike the Posix one above, there is no
12592      * <depends_list>, because having a Unicode property forces Unicode
12593      * semantics */
12594     if (properties) {
12595         bool warn_super = ! has_user_defined_property;
12596         if (cp_list) {
12597
12598             /* If it matters to the final outcome, see if a non-property
12599              * component of the class matches above Unicode.  If so, the
12600              * warning gets suppressed.  This is true even if just a single
12601              * such code point is specified, as though not strictly correct if
12602              * another such code point is matched against, the fact that they
12603              * are using above-Unicode code points indicates they should know
12604              * the issues involved */
12605             if (warn_super) {
12606                 bool non_prop_matches_above_Unicode =
12607                             runtime_posix_matches_above_Unicode
12608                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12609                 if (invert) {
12610                     non_prop_matches_above_Unicode =
12611                                             !  non_prop_matches_above_Unicode;
12612                 }
12613                 warn_super = ! non_prop_matches_above_Unicode;
12614             }
12615
12616             _invlist_union(properties, cp_list, &cp_list);
12617             SvREFCNT_dec(properties);
12618         }
12619         else {
12620             cp_list = properties;
12621         }
12622
12623         if (warn_super) {
12624             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12625         }
12626     }
12627
12628     /* Here, we have calculated what code points should be in the character
12629      * class.
12630      *
12631      * Now we can see about various optimizations.  Fold calculation (which we
12632      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12633      * would invert to include K, which under /i would match k, which it
12634      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12635      * folded until runtime */
12636
12637     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12638      * at compile time.  Besides not inverting folded locale now, we can't invert
12639      * if there are things such as \w, which aren't known until runtime */
12640     if (invert
12641         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12642         && ! depends_list
12643         && ! unicode_alternate
12644         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12645     {
12646         _invlist_invert(cp_list);
12647
12648         /* Any swash can't be used as-is, because we've inverted things */
12649         if (swash) {
12650             SvREFCNT_dec(swash);
12651             swash = NULL;
12652         }
12653
12654         /* Clear the invert flag since have just done it here */
12655         invert = FALSE;
12656     }
12657
12658     /* If we didn't do folding, it's because some information isn't available
12659      * until runtime; set the run-time fold flag for these.  (We don't have to
12660      * worry about properties folding, as that is taken care of by the swash
12661      * fetching) */
12662     if (FOLD && (LOC || unicode_alternate))
12663     {
12664        ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12665     }
12666
12667     /* Some character classes are equivalent to other nodes.  Such nodes take
12668      * up less room and generally fewer operations to execute than ANYOF nodes.
12669      * Above, we checked for and optimized into some such equivalents for
12670      * certain common classes that are easy to test.  Getting to this point in
12671      * the code means that the class didn't get optimized there.  Since this
12672      * code is only executed in Pass 2, it is too late to save space--it has
12673      * been allocated in Pass 1, and currently isn't given back.  But turning
12674      * things into an EXACTish node can allow the optimizer to join it to any
12675      * adjacent such nodes.  And if the class is equivalent to things like /./,
12676      * expensive run-time swashes can be avoided.  Now that we have more
12677      * complete information, we can find things necessarily missed by the
12678      * earlier code.  I (khw) am not sure how much to look for here.  It would
12679      * be easy, but perhaps too slow, to check any candidates against all the
12680      * node types they could possibly match using _invlistEQ(). */
12681
12682     if (cp_list
12683         && ! unicode_alternate
12684         && ! invert
12685         && ! depends_list
12686         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12687         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12688     {
12689        UV start, end;
12690        U8 op = END;  /* The optimzation node-type */
12691         const char * cur_parse= RExC_parse;
12692
12693        invlist_iterinit(cp_list);
12694        if (! invlist_iternext(cp_list, &start, &end)) {
12695
12696             /* Here, the list is empty.  This happens, for example, when a
12697              * Unicode property is the only thing in the character class, and
12698              * it doesn't match anything.  (perluniprops.pod notes such
12699              * properties) */
12700             op = OPFAIL;
12701             *flagp |= HASWIDTH|SIMPLE;
12702         }
12703         else if (start == end) {    /* The range is a single code point */
12704             if (! invlist_iternext(cp_list, &start, &end)
12705
12706                     /* Don't do this optimization if it would require changing
12707                      * the pattern to UTF-8 */
12708                 && (start < 256 || UTF))
12709             {
12710                 /* Here, the list contains a single code point.  Can optimize
12711                  * into an EXACT node */
12712
12713                 value = start;
12714
12715                 if (! FOLD) {
12716                     op = EXACT;
12717                 }
12718                 else if (LOC) {
12719
12720                     /* A locale node under folding with one code point can be
12721                      * an EXACTFL, as its fold won't be calculated until
12722                      * runtime */
12723                     op = EXACTFL;
12724                 }
12725                 else {
12726
12727                     /* Here, we are generally folding, but there is only one
12728                      * code point to match.  If we have to, we use an EXACT
12729                      * node, but it would be better for joining with adjacent
12730                      * nodes in the optimization pass if we used the same
12731                      * EXACTFish node that any such are likely to be.  We can
12732                      * do this iff the code point doesn't participate in any
12733                      * folds.  For example, an EXACTF of a colon is the same as
12734                      * an EXACT one, since nothing folds to or from a colon.
12735                      * In the Latin1 range, being an alpha means that the
12736                      * character participates in a fold (except for the
12737                      * feminine and masculine ordinals, which I (khw) don't
12738                      * think are worrying about optimizing for). */
12739                     if (value < 256) {
12740                         if (isALPHA_L1(value)) {
12741                             op = EXACT;
12742                         }
12743                     }
12744                     else {
12745                         if (! PL_utf8_foldable) {
12746                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12747                                                 &PL_sv_undef, 1, 0);
12748                             PL_utf8_foldable = _get_swash_invlist(swash);
12749                             SvREFCNT_dec(swash);
12750                         }
12751                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12752                             op = EXACT;
12753                         }
12754                     }
12755
12756                     /* If we haven't found the node type, above, it means we
12757                      * can use the prevailing one */
12758                     if (op == END) {
12759                         op = compute_EXACTish(pRExC_state);
12760                     }
12761                 }
12762             }
12763         }
12764         else if (start == 0) {
12765             if (end == UV_MAX) {
12766                 op = SANY;
12767                 *flagp |= HASWIDTH|SIMPLE;
12768                 RExC_naughty++;
12769             }
12770             else if (end == '\n' - 1
12771                     && invlist_iternext(cp_list, &start, &end)
12772                     && start == '\n' + 1 && end == UV_MAX)
12773             {
12774                 op = REG_ANY;
12775                 *flagp |= HASWIDTH|SIMPLE;
12776                 RExC_naughty++;
12777             }
12778         }
12779
12780         if (op != END) {
12781             RExC_parse = (char *)orig_parse;
12782             RExC_emit = (regnode *)orig_emit;
12783
12784             ret = reg_node(pRExC_state, op);
12785
12786             RExC_parse = (char *)cur_parse;
12787
12788             if (PL_regkind[op] == EXACT) {
12789                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12790             }
12791
12792             SvREFCNT_dec(listsv);
12793             return ret;
12794         }
12795     }
12796
12797     /* Here, <cp_list> contains all the code points we can determine at
12798      * compile time that match under all conditions.  Go through it, and
12799      * for things that belong in the bitmap, put them there, and delete from
12800      * <cp_list>.  While we are at it, see if everything above 255 is in the
12801      * list, and if so, set a flag to speed up execution */
12802     ANYOF_BITMAP_ZERO(ret);
12803     if (cp_list) {
12804
12805         /* This gets set if we actually need to modify things */
12806         bool change_invlist = FALSE;
12807
12808         UV start, end;
12809
12810         /* Start looking through <cp_list> */
12811         invlist_iterinit(cp_list);
12812         while (invlist_iternext(cp_list, &start, &end)) {
12813             UV high;
12814             int i;
12815
12816             if (end == UV_MAX && start <= 256) {
12817                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12818             }
12819
12820             /* Quit if are above what we should change */
12821             if (start > 255) {
12822                 break;
12823             }
12824
12825             change_invlist = TRUE;
12826
12827             /* Set all the bits in the range, up to the max that we are doing */
12828             high = (end < 255) ? end : 255;
12829             for (i = start; i <= (int) high; i++) {
12830                 if (! ANYOF_BITMAP_TEST(ret, i)) {
12831                     ANYOF_BITMAP_SET(ret, i);
12832                     prevvalue = value;
12833                     value = i;
12834                 }
12835             }
12836         }
12837
12838         /* Done with loop; remove any code points that are in the bitmap from
12839          * <cp_list> */
12840         if (change_invlist) {
12841             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12842         }
12843
12844         /* If have completely emptied it, remove it completely */
12845         if (invlist_len(cp_list) == 0) {
12846             SvREFCNT_dec(cp_list);
12847             cp_list = NULL;
12848         }
12849     }
12850
12851     if (invert) {
12852         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12853     }
12854
12855     /* Here, the bitmap has been populated with all the Latin1 code points that
12856      * always match.  Can now add to the overall list those that match only
12857      * when the target string is UTF-8 (<depends_list>). */
12858     if (depends_list) {
12859         if (cp_list) {
12860             _invlist_union(cp_list, depends_list, &cp_list);
12861             SvREFCNT_dec(depends_list);
12862         }
12863         else {
12864             cp_list = depends_list;
12865         }
12866     }
12867
12868     /* If there is a swash and more than one element, we can't use the swash in
12869      * the optimization below. */
12870     if (swash && element_count > 1) {
12871         SvREFCNT_dec(swash);
12872         swash = NULL;
12873     }
12874
12875     if (! cp_list
12876         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12877         && ! unicode_alternate)
12878     {
12879         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12880         SvREFCNT_dec(listsv);
12881         SvREFCNT_dec(unicode_alternate);
12882     }
12883     else {
12884         /* av[0] stores the character class description in its textual form:
12885          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12886          *       appropriate swash, and is also useful for dumping the regnode.
12887          * av[1] if NULL, is a placeholder to later contain the swash computed
12888          *       from av[0].  But if no further computation need be done, the
12889          *       swash is stored there now.
12890          * av[2] stores the multicharacter foldings, used later in
12891          *       regexec.c:S_reginclass().
12892          * av[3] stores the cp_list inversion list for use in addition or
12893          *       instead of av[0]; used only if av[1] is NULL
12894          * av[4] is set if any component of the class is from a user-defined
12895          *       property; used only if av[1] is NULL */
12896         AV * const av = newAV();
12897         SV *rv;
12898
12899         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12900                         ? listsv
12901                         : &PL_sv_undef);
12902         if (swash) {
12903             av_store(av, 1, swash);
12904             SvREFCNT_dec(cp_list);
12905         }
12906         else {
12907             av_store(av, 1, NULL);
12908             if (cp_list) {
12909                 av_store(av, 3, cp_list);
12910                 av_store(av, 4, newSVuv(has_user_defined_property));
12911             }
12912         }
12913
12914         /* Store any computed multi-char folds only if we are allowing
12915          * them */
12916         if (allow_full_fold) {
12917             av_store(av, 2, MUTABLE_SV(unicode_alternate));
12918             if (unicode_alternate) { /* This node is variable length */
12919                 OP(ret) = ANYOFV;
12920             }
12921         }
12922         else {
12923             av_store(av, 2, NULL);
12924         }
12925         rv = newRV_noinc(MUTABLE_SV(av));
12926         n = add_data(pRExC_state, 1, "s");
12927         RExC_rxi->data->data[n] = (void*)rv;
12928         ARG_SET(ret, n);
12929     }
12930
12931     *flagp |= HASWIDTH|SIMPLE;
12932     return ret;
12933 }
12934 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12935
12936
12937 /* reg_skipcomment()
12938
12939    Absorbs an /x style # comments from the input stream.
12940    Returns true if there is more text remaining in the stream.
12941    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12942    terminates the pattern without including a newline.
12943
12944    Note its the callers responsibility to ensure that we are
12945    actually in /x mode
12946
12947 */
12948
12949 STATIC bool
12950 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12951 {
12952     bool ended = 0;
12953
12954     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12955
12956     while (RExC_parse < RExC_end)
12957         if (*RExC_parse++ == '\n') {
12958             ended = 1;
12959             break;
12960         }
12961     if (!ended) {
12962         /* we ran off the end of the pattern without ending
12963            the comment, so we have to add an \n when wrapping */
12964         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12965         return 0;
12966     } else
12967         return 1;
12968 }
12969
12970 /* nextchar()
12971
12972    Advances the parse position, and optionally absorbs
12973    "whitespace" from the inputstream.
12974
12975    Without /x "whitespace" means (?#...) style comments only,
12976    with /x this means (?#...) and # comments and whitespace proper.
12977
12978    Returns the RExC_parse point from BEFORE the scan occurs.
12979
12980    This is the /x friendly way of saying RExC_parse++.
12981 */
12982
12983 STATIC char*
12984 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12985 {
12986     char* const retval = RExC_parse++;
12987
12988     PERL_ARGS_ASSERT_NEXTCHAR;
12989
12990     for (;;) {
12991         if (RExC_end - RExC_parse >= 3
12992             && *RExC_parse == '('
12993             && RExC_parse[1] == '?'
12994             && RExC_parse[2] == '#')
12995         {
12996             while (*RExC_parse != ')') {
12997                 if (RExC_parse == RExC_end)
12998                     FAIL("Sequence (?#... not terminated");
12999                 RExC_parse++;
13000             }
13001             RExC_parse++;
13002             continue;
13003         }
13004         if (RExC_flags & RXf_PMf_EXTENDED) {
13005             if (isSPACE(*RExC_parse)) {
13006                 RExC_parse++;
13007                 continue;
13008             }
13009             else if (*RExC_parse == '#') {
13010                 if ( reg_skipcomment( pRExC_state ) )
13011                     continue;
13012             }
13013         }
13014         return retval;
13015     }
13016 }
13017
13018 /*
13019 - reg_node - emit a node
13020 */
13021 STATIC regnode *                        /* Location. */
13022 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13023 {
13024     dVAR;
13025     regnode *ptr;
13026     regnode * const ret = RExC_emit;
13027     GET_RE_DEBUG_FLAGS_DECL;
13028
13029     PERL_ARGS_ASSERT_REG_NODE;
13030
13031     if (SIZE_ONLY) {
13032         SIZE_ALIGN(RExC_size);
13033         RExC_size += 1;
13034         return(ret);
13035     }
13036     if (RExC_emit >= RExC_emit_bound)
13037         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13038                    op, RExC_emit, RExC_emit_bound);
13039
13040     NODE_ALIGN_FILL(ret);
13041     ptr = ret;
13042     FILL_ADVANCE_NODE(ptr, op);
13043 #ifdef RE_TRACK_PATTERN_OFFSETS
13044     if (RExC_offsets) {         /* MJD */
13045         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13046               "reg_node", __LINE__,
13047               PL_reg_name[op],
13048               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13049                 ? "Overwriting end of array!\n" : "OK",
13050               (UV)(RExC_emit - RExC_emit_start),
13051               (UV)(RExC_parse - RExC_start),
13052               (UV)RExC_offsets[0]));
13053         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13054     }
13055 #endif
13056     RExC_emit = ptr;
13057     return(ret);
13058 }
13059
13060 /*
13061 - reganode - emit a node with an argument
13062 */
13063 STATIC regnode *                        /* Location. */
13064 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13065 {
13066     dVAR;
13067     regnode *ptr;
13068     regnode * const ret = RExC_emit;
13069     GET_RE_DEBUG_FLAGS_DECL;
13070
13071     PERL_ARGS_ASSERT_REGANODE;
13072
13073     if (SIZE_ONLY) {
13074         SIZE_ALIGN(RExC_size);
13075         RExC_size += 2;
13076         /*
13077            We can't do this:
13078
13079            assert(2==regarglen[op]+1);
13080
13081            Anything larger than this has to allocate the extra amount.
13082            If we changed this to be:
13083
13084            RExC_size += (1 + regarglen[op]);
13085
13086            then it wouldn't matter. Its not clear what side effect
13087            might come from that so its not done so far.
13088            -- dmq
13089         */
13090         return(ret);
13091     }
13092     if (RExC_emit >= RExC_emit_bound)
13093         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13094                    op, RExC_emit, RExC_emit_bound);
13095
13096     NODE_ALIGN_FILL(ret);
13097     ptr = ret;
13098     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13099 #ifdef RE_TRACK_PATTERN_OFFSETS
13100     if (RExC_offsets) {         /* MJD */
13101         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13102               "reganode",
13103               __LINE__,
13104               PL_reg_name[op],
13105               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13106               "Overwriting end of array!\n" : "OK",
13107               (UV)(RExC_emit - RExC_emit_start),
13108               (UV)(RExC_parse - RExC_start),
13109               (UV)RExC_offsets[0]));
13110         Set_Cur_Node_Offset;
13111     }
13112 #endif
13113     RExC_emit = ptr;
13114     return(ret);
13115 }
13116
13117 /*
13118 - reguni - emit (if appropriate) a Unicode character
13119 */
13120 STATIC STRLEN
13121 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13122 {
13123     dVAR;
13124
13125     PERL_ARGS_ASSERT_REGUNI;
13126
13127     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13128 }
13129
13130 /*
13131 - reginsert - insert an operator in front of already-emitted operand
13132 *
13133 * Means relocating the operand.
13134 */
13135 STATIC void
13136 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13137 {
13138     dVAR;
13139     regnode *src;
13140     regnode *dst;
13141     regnode *place;
13142     const int offset = regarglen[(U8)op];
13143     const int size = NODE_STEP_REGNODE + offset;
13144     GET_RE_DEBUG_FLAGS_DECL;
13145
13146     PERL_ARGS_ASSERT_REGINSERT;
13147     PERL_UNUSED_ARG(depth);
13148 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13149     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13150     if (SIZE_ONLY) {
13151         RExC_size += size;
13152         return;
13153     }
13154
13155     src = RExC_emit;
13156     RExC_emit += size;
13157     dst = RExC_emit;
13158     if (RExC_open_parens) {
13159         int paren;
13160         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13161         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13162             if ( RExC_open_parens[paren] >= opnd ) {
13163                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13164                 RExC_open_parens[paren] += size;
13165             } else {
13166                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13167             }
13168             if ( RExC_close_parens[paren] >= opnd ) {
13169                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13170                 RExC_close_parens[paren] += size;
13171             } else {
13172                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13173             }
13174         }
13175     }
13176
13177     while (src > opnd) {
13178         StructCopy(--src, --dst, regnode);
13179 #ifdef RE_TRACK_PATTERN_OFFSETS
13180         if (RExC_offsets) {     /* MJD 20010112 */
13181             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13182                   "reg_insert",
13183                   __LINE__,
13184                   PL_reg_name[op],
13185                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13186                     ? "Overwriting end of array!\n" : "OK",
13187                   (UV)(src - RExC_emit_start),
13188                   (UV)(dst - RExC_emit_start),
13189                   (UV)RExC_offsets[0]));
13190             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13191             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13192         }
13193 #endif
13194     }
13195
13196
13197     place = opnd;               /* Op node, where operand used to be. */
13198 #ifdef RE_TRACK_PATTERN_OFFSETS
13199     if (RExC_offsets) {         /* MJD */
13200         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13201               "reginsert",
13202               __LINE__,
13203               PL_reg_name[op],
13204               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13205               ? "Overwriting end of array!\n" : "OK",
13206               (UV)(place - RExC_emit_start),
13207               (UV)(RExC_parse - RExC_start),
13208               (UV)RExC_offsets[0]));
13209         Set_Node_Offset(place, RExC_parse);
13210         Set_Node_Length(place, 1);
13211     }
13212 #endif
13213     src = NEXTOPER(place);
13214     FILL_ADVANCE_NODE(place, op);
13215     Zero(src, offset, regnode);
13216 }
13217
13218 /*
13219 - regtail - set the next-pointer at the end of a node chain of p to val.
13220 - SEE ALSO: regtail_study
13221 */
13222 /* TODO: All three parms should be const */
13223 STATIC void
13224 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13225 {
13226     dVAR;
13227     regnode *scan;
13228     GET_RE_DEBUG_FLAGS_DECL;
13229
13230     PERL_ARGS_ASSERT_REGTAIL;
13231 #ifndef DEBUGGING
13232     PERL_UNUSED_ARG(depth);
13233 #endif
13234
13235     if (SIZE_ONLY)
13236         return;
13237
13238     /* Find last node. */
13239     scan = p;
13240     for (;;) {
13241         regnode * const temp = regnext(scan);
13242         DEBUG_PARSE_r({
13243             SV * const mysv=sv_newmortal();
13244             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13245             regprop(RExC_rx, mysv, scan);
13246             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13247                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13248                     (temp == NULL ? "->" : ""),
13249                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13250             );
13251         });
13252         if (temp == NULL)
13253             break;
13254         scan = temp;
13255     }
13256
13257     if (reg_off_by_arg[OP(scan)]) {
13258         ARG_SET(scan, val - scan);
13259     }
13260     else {
13261         NEXT_OFF(scan) = val - scan;
13262     }
13263 }
13264
13265 #ifdef DEBUGGING
13266 /*
13267 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13268 - Look for optimizable sequences at the same time.
13269 - currently only looks for EXACT chains.
13270
13271 This is experimental code. The idea is to use this routine to perform
13272 in place optimizations on branches and groups as they are constructed,
13273 with the long term intention of removing optimization from study_chunk so
13274 that it is purely analytical.
13275
13276 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13277 to control which is which.
13278
13279 */
13280 /* TODO: All four parms should be const */
13281
13282 STATIC U8
13283 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13284 {
13285     dVAR;
13286     regnode *scan;
13287     U8 exact = PSEUDO;
13288 #ifdef EXPERIMENTAL_INPLACESCAN
13289     I32 min = 0;
13290 #endif
13291     GET_RE_DEBUG_FLAGS_DECL;
13292
13293     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13294
13295
13296     if (SIZE_ONLY)
13297         return exact;
13298
13299     /* Find last node. */
13300
13301     scan = p;
13302     for (;;) {
13303         regnode * const temp = regnext(scan);
13304 #ifdef EXPERIMENTAL_INPLACESCAN
13305         if (PL_regkind[OP(scan)] == EXACT) {
13306             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13307             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13308                 return EXACT;
13309         }
13310 #endif
13311         if ( exact ) {
13312             switch (OP(scan)) {
13313                 case EXACT:
13314                 case EXACTF:
13315                 case EXACTFA:
13316                 case EXACTFU:
13317                 case EXACTFU_SS:
13318                 case EXACTFU_TRICKYFOLD:
13319                 case EXACTFL:
13320                         if( exact == PSEUDO )
13321                             exact= OP(scan);
13322                         else if ( exact != OP(scan) )
13323                             exact= 0;
13324                 case NOTHING:
13325                     break;
13326                 default:
13327                     exact= 0;
13328             }
13329         }
13330         DEBUG_PARSE_r({
13331             SV * const mysv=sv_newmortal();
13332             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13333             regprop(RExC_rx, mysv, scan);
13334             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13335                 SvPV_nolen_const(mysv),
13336                 REG_NODE_NUM(scan),
13337                 PL_reg_name[exact]);
13338         });
13339         if (temp == NULL)
13340             break;
13341         scan = temp;
13342     }
13343     DEBUG_PARSE_r({
13344         SV * const mysv_val=sv_newmortal();
13345         DEBUG_PARSE_MSG("");
13346         regprop(RExC_rx, mysv_val, val);
13347         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13348                       SvPV_nolen_const(mysv_val),
13349                       (IV)REG_NODE_NUM(val),
13350                       (IV)(val - scan)
13351         );
13352     });
13353     if (reg_off_by_arg[OP(scan)]) {
13354         ARG_SET(scan, val - scan);
13355     }
13356     else {
13357         NEXT_OFF(scan) = val - scan;
13358     }
13359
13360     return exact;
13361 }
13362 #endif
13363
13364 /*
13365  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13366  */
13367 #ifdef DEBUGGING
13368 static void
13369 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13370 {
13371     int bit;
13372     int set=0;
13373     regex_charset cs;
13374
13375     for (bit=0; bit<32; bit++) {
13376         if (flags & (1<<bit)) {
13377             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13378                 continue;
13379             }
13380             if (!set++ && lead)
13381                 PerlIO_printf(Perl_debug_log, "%s",lead);
13382             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13383         }
13384     }
13385     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13386             if (!set++ && lead) {
13387                 PerlIO_printf(Perl_debug_log, "%s",lead);
13388             }
13389             switch (cs) {
13390                 case REGEX_UNICODE_CHARSET:
13391                     PerlIO_printf(Perl_debug_log, "UNICODE");
13392                     break;
13393                 case REGEX_LOCALE_CHARSET:
13394                     PerlIO_printf(Perl_debug_log, "LOCALE");
13395                     break;
13396                 case REGEX_ASCII_RESTRICTED_CHARSET:
13397                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13398                     break;
13399                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13400                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13401                     break;
13402                 default:
13403                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13404                     break;
13405             }
13406     }
13407     if (lead)  {
13408         if (set)
13409             PerlIO_printf(Perl_debug_log, "\n");
13410         else
13411             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13412     }
13413 }
13414 #endif
13415
13416 void
13417 Perl_regdump(pTHX_ const regexp *r)
13418 {
13419 #ifdef DEBUGGING
13420     dVAR;
13421     SV * const sv = sv_newmortal();
13422     SV *dsv= sv_newmortal();
13423     RXi_GET_DECL(r,ri);
13424     GET_RE_DEBUG_FLAGS_DECL;
13425
13426     PERL_ARGS_ASSERT_REGDUMP;
13427
13428     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13429
13430     /* Header fields of interest. */
13431     if (r->anchored_substr) {
13432         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13433             RE_SV_DUMPLEN(r->anchored_substr), 30);
13434         PerlIO_printf(Perl_debug_log,
13435                       "anchored %s%s at %"IVdf" ",
13436                       s, RE_SV_TAIL(r->anchored_substr),
13437                       (IV)r->anchored_offset);
13438     } else if (r->anchored_utf8) {
13439         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13440             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13441         PerlIO_printf(Perl_debug_log,
13442                       "anchored utf8 %s%s at %"IVdf" ",
13443                       s, RE_SV_TAIL(r->anchored_utf8),
13444                       (IV)r->anchored_offset);
13445     }
13446     if (r->float_substr) {
13447         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13448             RE_SV_DUMPLEN(r->float_substr), 30);
13449         PerlIO_printf(Perl_debug_log,
13450                       "floating %s%s at %"IVdf"..%"UVuf" ",
13451                       s, RE_SV_TAIL(r->float_substr),
13452                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13453     } else if (r->float_utf8) {
13454         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13455             RE_SV_DUMPLEN(r->float_utf8), 30);
13456         PerlIO_printf(Perl_debug_log,
13457                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13458                       s, RE_SV_TAIL(r->float_utf8),
13459                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13460     }
13461     if (r->check_substr || r->check_utf8)
13462         PerlIO_printf(Perl_debug_log,
13463                       (const char *)
13464                       (r->check_substr == r->float_substr
13465                        && r->check_utf8 == r->float_utf8
13466                        ? "(checking floating" : "(checking anchored"));
13467     if (r->extflags & RXf_NOSCAN)
13468         PerlIO_printf(Perl_debug_log, " noscan");
13469     if (r->extflags & RXf_CHECK_ALL)
13470         PerlIO_printf(Perl_debug_log, " isall");
13471     if (r->check_substr || r->check_utf8)
13472         PerlIO_printf(Perl_debug_log, ") ");
13473
13474     if (ri->regstclass) {
13475         regprop(r, sv, ri->regstclass);
13476         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13477     }
13478     if (r->extflags & RXf_ANCH) {
13479         PerlIO_printf(Perl_debug_log, "anchored");
13480         if (r->extflags & RXf_ANCH_BOL)
13481             PerlIO_printf(Perl_debug_log, "(BOL)");
13482         if (r->extflags & RXf_ANCH_MBOL)
13483             PerlIO_printf(Perl_debug_log, "(MBOL)");
13484         if (r->extflags & RXf_ANCH_SBOL)
13485             PerlIO_printf(Perl_debug_log, "(SBOL)");
13486         if (r->extflags & RXf_ANCH_GPOS)
13487             PerlIO_printf(Perl_debug_log, "(GPOS)");
13488         PerlIO_putc(Perl_debug_log, ' ');
13489     }
13490     if (r->extflags & RXf_GPOS_SEEN)
13491         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13492     if (r->intflags & PREGf_SKIP)
13493         PerlIO_printf(Perl_debug_log, "plus ");
13494     if (r->intflags & PREGf_IMPLICIT)
13495         PerlIO_printf(Perl_debug_log, "implicit ");
13496     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13497     if (r->extflags & RXf_EVAL_SEEN)
13498         PerlIO_printf(Perl_debug_log, "with eval ");
13499     PerlIO_printf(Perl_debug_log, "\n");
13500     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13501 #else
13502     PERL_ARGS_ASSERT_REGDUMP;
13503     PERL_UNUSED_CONTEXT;
13504     PERL_UNUSED_ARG(r);
13505 #endif  /* DEBUGGING */
13506 }
13507
13508 /*
13509 - regprop - printable representation of opcode
13510 */
13511 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13512 STMT_START { \
13513         if (do_sep) {                           \
13514             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13515             if (flags & ANYOF_INVERT)           \
13516                 /*make sure the invert info is in each */ \
13517                 sv_catpvs(sv, "^");             \
13518             do_sep = 0;                         \
13519         }                                       \
13520 } STMT_END
13521
13522 void
13523 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13524 {
13525 #ifdef DEBUGGING
13526     dVAR;
13527     int k;
13528
13529     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13530     static const char * const anyofs[] = {
13531         "\\w",
13532         "\\W",
13533         "\\s",
13534         "\\S",
13535         "\\d",
13536         "\\D",
13537         "[:alnum:]",
13538         "[:^alnum:]",
13539         "[:alpha:]",
13540         "[:^alpha:]",
13541         "[:ascii:]",
13542         "[:^ascii:]",
13543         "[:cntrl:]",
13544         "[:^cntrl:]",
13545         "[:graph:]",
13546         "[:^graph:]",
13547         "[:lower:]",
13548         "[:^lower:]",
13549         "[:print:]",
13550         "[:^print:]",
13551         "[:punct:]",
13552         "[:^punct:]",
13553         "[:upper:]",
13554         "[:^upper:]",
13555         "[:xdigit:]",
13556         "[:^xdigit:]",
13557         "[:space:]",
13558         "[:^space:]",
13559         "[:blank:]",
13560         "[:^blank:]"
13561     };
13562     RXi_GET_DECL(prog,progi);
13563     GET_RE_DEBUG_FLAGS_DECL;
13564
13565     PERL_ARGS_ASSERT_REGPROP;
13566
13567     sv_setpvs(sv, "");
13568
13569     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13570         /* It would be nice to FAIL() here, but this may be called from
13571            regexec.c, and it would be hard to supply pRExC_state. */
13572         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13573     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13574
13575     k = PL_regkind[OP(o)];
13576
13577     if (k == EXACT) {
13578         sv_catpvs(sv, " ");
13579         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13580          * is a crude hack but it may be the best for now since
13581          * we have no flag "this EXACTish node was UTF-8"
13582          * --jhi */
13583         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13584                   PERL_PV_ESCAPE_UNI_DETECT |
13585                   PERL_PV_ESCAPE_NONASCII   |
13586                   PERL_PV_PRETTY_ELLIPSES   |
13587                   PERL_PV_PRETTY_LTGT       |
13588                   PERL_PV_PRETTY_NOCLEAR
13589                   );
13590     } else if (k == TRIE) {
13591         /* print the details of the trie in dumpuntil instead, as
13592          * progi->data isn't available here */
13593         const char op = OP(o);
13594         const U32 n = ARG(o);
13595         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13596                (reg_ac_data *)progi->data->data[n] :
13597                NULL;
13598         const reg_trie_data * const trie
13599             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13600
13601         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13602         DEBUG_TRIE_COMPILE_r(
13603             Perl_sv_catpvf(aTHX_ sv,
13604                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13605                 (UV)trie->startstate,
13606                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13607                 (UV)trie->wordcount,
13608                 (UV)trie->minlen,
13609                 (UV)trie->maxlen,
13610                 (UV)TRIE_CHARCOUNT(trie),
13611                 (UV)trie->uniquecharcount
13612             )
13613         );
13614         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13615             int i;
13616             int rangestart = -1;
13617             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13618             sv_catpvs(sv, "[");
13619             for (i = 0; i <= 256; i++) {
13620                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13621                     if (rangestart == -1)
13622                         rangestart = i;
13623                 } else if (rangestart != -1) {
13624                     if (i <= rangestart + 3)
13625                         for (; rangestart < i; rangestart++)
13626                             put_byte(sv, rangestart);
13627                     else {
13628                         put_byte(sv, rangestart);
13629                         sv_catpvs(sv, "-");
13630                         put_byte(sv, i - 1);
13631                     }
13632                     rangestart = -1;
13633                 }
13634             }
13635             sv_catpvs(sv, "]");
13636         }
13637
13638     } else if (k == CURLY) {
13639         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13640             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13641         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13642     }
13643     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13644         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13645     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13646         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13647         if ( RXp_PAREN_NAMES(prog) ) {
13648             if ( k != REF || (OP(o) < NREF)) {
13649                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13650                 SV **name= av_fetch(list, ARG(o), 0 );
13651                 if (name)
13652                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13653             }
13654             else {
13655                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13656                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13657                 I32 *nums=(I32*)SvPVX(sv_dat);
13658                 SV **name= av_fetch(list, nums[0], 0 );
13659                 I32 n;
13660                 if (name) {
13661                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13662                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13663                                     (n ? "," : ""), (IV)nums[n]);
13664                     }
13665                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13666                 }
13667             }
13668         }
13669     } else if (k == GOSUB)
13670         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13671     else if (k == VERB) {
13672         if (!o->flags)
13673             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13674                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13675     } else if (k == LOGICAL)
13676         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13677     else if (k == ANYOF) {
13678         int i, rangestart = -1;
13679         const U8 flags = ANYOF_FLAGS(o);
13680         int do_sep = 0;
13681
13682
13683         if (flags & ANYOF_LOCALE)
13684             sv_catpvs(sv, "{loc}");
13685         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13686             sv_catpvs(sv, "{i}");
13687         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13688         if (flags & ANYOF_INVERT)
13689             sv_catpvs(sv, "^");
13690
13691         /* output what the standard cp 0-255 bitmap matches */
13692         for (i = 0; i <= 256; i++) {
13693             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13694                 if (rangestart == -1)
13695                     rangestart = i;
13696             } else if (rangestart != -1) {
13697                 if (i <= rangestart + 3)
13698                     for (; rangestart < i; rangestart++)
13699                         put_byte(sv, rangestart);
13700                 else {
13701                     put_byte(sv, rangestart);
13702                     sv_catpvs(sv, "-");
13703                     put_byte(sv, i - 1);
13704                 }
13705                 do_sep = 1;
13706                 rangestart = -1;
13707             }
13708         }
13709
13710         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13711         /* output any special charclass tests (used entirely under use locale) */
13712         if (ANYOF_CLASS_TEST_ANY_SET(o))
13713             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13714                 if (ANYOF_CLASS_TEST(o,i)) {
13715                     sv_catpv(sv, anyofs[i]);
13716                     do_sep = 1;
13717                 }
13718
13719         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13720
13721         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13722             sv_catpvs(sv, "{non-utf8-latin1-all}");
13723         }
13724
13725         /* output information about the unicode matching */
13726         if (flags & ANYOF_UNICODE_ALL)
13727             sv_catpvs(sv, "{unicode_all}");
13728         else if (ANYOF_NONBITMAP(o))
13729             sv_catpvs(sv, "{unicode}");
13730         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13731             sv_catpvs(sv, "{outside bitmap}");
13732
13733         if (ANYOF_NONBITMAP(o)) {
13734             SV *lv; /* Set if there is something outside the bit map */
13735             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13736             bool byte_output = FALSE;   /* If something in the bitmap has been
13737                                            output */
13738
13739             if (lv && lv != &PL_sv_undef) {
13740                 if (sw) {
13741                     U8 s[UTF8_MAXBYTES_CASE+1];
13742
13743                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13744                         uvchr_to_utf8(s, i);
13745
13746                         if (i < 256
13747                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13748                                                                things already
13749                                                                output as part
13750                                                                of the bitmap */
13751                             && swash_fetch(sw, s, TRUE))
13752                         {
13753                             if (rangestart == -1)
13754                                 rangestart = i;
13755                         } else if (rangestart != -1) {
13756                             byte_output = TRUE;
13757                             if (i <= rangestart + 3)
13758                                 for (; rangestart < i; rangestart++) {
13759                                     put_byte(sv, rangestart);
13760                                 }
13761                             else {
13762                                 put_byte(sv, rangestart);
13763                                 sv_catpvs(sv, "-");
13764                                 put_byte(sv, i-1);
13765                             }
13766                             rangestart = -1;
13767                         }
13768                     }
13769                 }
13770
13771                 {
13772                     char *s = savesvpv(lv);
13773                     char * const origs = s;
13774
13775                     while (*s && *s != '\n')
13776                         s++;
13777
13778                     if (*s == '\n') {
13779                         const char * const t = ++s;
13780
13781                         if (byte_output) {
13782                             sv_catpvs(sv, " ");
13783                         }
13784
13785                         while (*s) {
13786                             if (*s == '\n') {
13787
13788                                 /* Truncate very long output */
13789                                 if (s - origs > 256) {
13790                                     Perl_sv_catpvf(aTHX_ sv,
13791                                                    "%.*s...",
13792                                                    (int) (s - origs - 1),
13793                                                    t);
13794                                     goto out_dump;
13795                                 }
13796                                 *s = ' ';
13797                             }
13798                             else if (*s == '\t') {
13799                                 *s = '-';
13800                             }
13801                             s++;
13802                         }
13803                         if (s[-1] == ' ')
13804                             s[-1] = 0;
13805
13806                         sv_catpv(sv, t);
13807                     }
13808
13809                 out_dump:
13810
13811                     Safefree(origs);
13812                 }
13813                 SvREFCNT_dec(lv);
13814             }
13815         }
13816
13817         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13818     }
13819     else if (k == POSIXD) {
13820         U8 index = FLAGS(o) * 2;
13821         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13822             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13823         }
13824         else {
13825             sv_catpv(sv, anyofs[index]);
13826         }
13827     }
13828     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13829         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13830 #else
13831     PERL_UNUSED_CONTEXT;
13832     PERL_UNUSED_ARG(sv);
13833     PERL_UNUSED_ARG(o);
13834     PERL_UNUSED_ARG(prog);
13835 #endif  /* DEBUGGING */
13836 }
13837
13838 SV *
13839 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13840 {                               /* Assume that RE_INTUIT is set */
13841     dVAR;
13842     struct regexp *const prog = (struct regexp *)SvANY(r);
13843     GET_RE_DEBUG_FLAGS_DECL;
13844
13845     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13846     PERL_UNUSED_CONTEXT;
13847
13848     DEBUG_COMPILE_r(
13849         {
13850             const char * const s = SvPV_nolen_const(prog->check_substr
13851                       ? prog->check_substr : prog->check_utf8);
13852
13853             if (!PL_colorset) reginitcolors();
13854             PerlIO_printf(Perl_debug_log,
13855                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13856                       PL_colors[4],
13857                       prog->check_substr ? "" : "utf8 ",
13858                       PL_colors[5],PL_colors[0],
13859                       s,
13860                       PL_colors[1],
13861                       (strlen(s) > 60 ? "..." : ""));
13862         } );
13863
13864     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13865 }
13866
13867 /*
13868    pregfree()
13869
13870    handles refcounting and freeing the perl core regexp structure. When
13871    it is necessary to actually free the structure the first thing it
13872    does is call the 'free' method of the regexp_engine associated to
13873    the regexp, allowing the handling of the void *pprivate; member
13874    first. (This routine is not overridable by extensions, which is why
13875    the extensions free is called first.)
13876
13877    See regdupe and regdupe_internal if you change anything here.
13878 */
13879 #ifndef PERL_IN_XSUB_RE
13880 void
13881 Perl_pregfree(pTHX_ REGEXP *r)
13882 {
13883     SvREFCNT_dec(r);
13884 }
13885
13886 void
13887 Perl_pregfree2(pTHX_ REGEXP *rx)
13888 {
13889     dVAR;
13890     struct regexp *const r = (struct regexp *)SvANY(rx);
13891     GET_RE_DEBUG_FLAGS_DECL;
13892
13893     PERL_ARGS_ASSERT_PREGFREE2;
13894
13895     if (r->mother_re) {
13896         ReREFCNT_dec(r->mother_re);
13897     } else {
13898         CALLREGFREE_PVT(rx); /* free the private data */
13899         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13900     }
13901     if (r->substrs) {
13902         SvREFCNT_dec(r->anchored_substr);
13903         SvREFCNT_dec(r->anchored_utf8);
13904         SvREFCNT_dec(r->float_substr);
13905         SvREFCNT_dec(r->float_utf8);
13906         Safefree(r->substrs);
13907     }
13908     RX_MATCH_COPY_FREE(rx);
13909 #ifdef PERL_OLD_COPY_ON_WRITE
13910     SvREFCNT_dec(r->saved_copy);
13911 #endif
13912     Safefree(r->offs);
13913     SvREFCNT_dec(r->qr_anoncv);
13914 }
13915
13916 /*  reg_temp_copy()
13917
13918     This is a hacky workaround to the structural issue of match results
13919     being stored in the regexp structure which is in turn stored in
13920     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13921     could be PL_curpm in multiple contexts, and could require multiple
13922     result sets being associated with the pattern simultaneously, such
13923     as when doing a recursive match with (??{$qr})
13924
13925     The solution is to make a lightweight copy of the regexp structure
13926     when a qr// is returned from the code executed by (??{$qr}) this
13927     lightweight copy doesn't actually own any of its data except for
13928     the starp/end and the actual regexp structure itself.
13929
13930 */
13931
13932
13933 REGEXP *
13934 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13935 {
13936     struct regexp *ret;
13937     struct regexp *const r = (struct regexp *)SvANY(rx);
13938
13939     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13940
13941     if (!ret_x)
13942         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13943     ret = (struct regexp *)SvANY(ret_x);
13944
13945     (void)ReREFCNT_inc(rx);
13946     /* We can take advantage of the existing "copied buffer" mechanism in SVs
13947        by pointing directly at the buffer, but flagging that the allocated
13948        space in the copy is zero. As we've just done a struct copy, it's now
13949        a case of zero-ing that, rather than copying the current length.  */
13950     SvPV_set(ret_x, RX_WRAPPED(rx));
13951     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13952     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13953            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13954     SvLEN_set(ret_x, 0);
13955     SvSTASH_set(ret_x, NULL);
13956     SvMAGIC_set(ret_x, NULL);
13957     if (r->offs) {
13958         const I32 npar = r->nparens+1;
13959         Newx(ret->offs, npar, regexp_paren_pair);
13960         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13961     }
13962     if (r->substrs) {
13963         Newx(ret->substrs, 1, struct reg_substr_data);
13964         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13965
13966         SvREFCNT_inc_void(ret->anchored_substr);
13967         SvREFCNT_inc_void(ret->anchored_utf8);
13968         SvREFCNT_inc_void(ret->float_substr);
13969         SvREFCNT_inc_void(ret->float_utf8);
13970
13971         /* check_substr and check_utf8, if non-NULL, point to either their
13972            anchored or float namesakes, and don't hold a second reference.  */
13973     }
13974     RX_MATCH_COPIED_off(ret_x);
13975 #ifdef PERL_OLD_COPY_ON_WRITE
13976     ret->saved_copy = NULL;
13977 #endif
13978     ret->mother_re = rx;
13979     SvREFCNT_inc_void(ret->qr_anoncv);
13980
13981     return ret_x;
13982 }
13983 #endif
13984
13985 /* regfree_internal()
13986
13987    Free the private data in a regexp. This is overloadable by
13988    extensions. Perl takes care of the regexp structure in pregfree(),
13989    this covers the *pprivate pointer which technically perl doesn't
13990    know about, however of course we have to handle the
13991    regexp_internal structure when no extension is in use.
13992
13993    Note this is called before freeing anything in the regexp
13994    structure.
13995  */
13996
13997 void
13998 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13999 {
14000     dVAR;
14001     struct regexp *const r = (struct regexp *)SvANY(rx);
14002     RXi_GET_DECL(r,ri);
14003     GET_RE_DEBUG_FLAGS_DECL;
14004
14005     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14006
14007     DEBUG_COMPILE_r({
14008         if (!PL_colorset)
14009             reginitcolors();
14010         {
14011             SV *dsv= sv_newmortal();
14012             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14013                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14014             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14015                 PL_colors[4],PL_colors[5],s);
14016         }
14017     });
14018 #ifdef RE_TRACK_PATTERN_OFFSETS
14019     if (ri->u.offsets)
14020         Safefree(ri->u.offsets);             /* 20010421 MJD */
14021 #endif
14022     if (ri->code_blocks) {
14023         int n;
14024         for (n = 0; n < ri->num_code_blocks; n++)
14025             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14026         Safefree(ri->code_blocks);
14027     }
14028
14029     if (ri->data) {
14030         int n = ri->data->count;
14031
14032         while (--n >= 0) {
14033           /* If you add a ->what type here, update the comment in regcomp.h */
14034             switch (ri->data->what[n]) {
14035             case 'a':
14036             case 'r':
14037             case 's':
14038             case 'S':
14039             case 'u':
14040                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14041                 break;
14042             case 'f':
14043                 Safefree(ri->data->data[n]);
14044                 break;
14045             case 'l':
14046             case 'L':
14047                 break;
14048             case 'T':
14049                 { /* Aho Corasick add-on structure for a trie node.
14050                      Used in stclass optimization only */
14051                     U32 refcount;
14052                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14053                     OP_REFCNT_LOCK;
14054                     refcount = --aho->refcount;
14055                     OP_REFCNT_UNLOCK;
14056                     if ( !refcount ) {
14057                         PerlMemShared_free(aho->states);
14058                         PerlMemShared_free(aho->fail);
14059                          /* do this last!!!! */
14060                         PerlMemShared_free(ri->data->data[n]);
14061                         PerlMemShared_free(ri->regstclass);
14062                     }
14063                 }
14064                 break;
14065             case 't':
14066                 {
14067                     /* trie structure. */
14068                     U32 refcount;
14069                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14070                     OP_REFCNT_LOCK;
14071                     refcount = --trie->refcount;
14072                     OP_REFCNT_UNLOCK;
14073                     if ( !refcount ) {
14074                         PerlMemShared_free(trie->charmap);
14075                         PerlMemShared_free(trie->states);
14076                         PerlMemShared_free(trie->trans);
14077                         if (trie->bitmap)
14078                             PerlMemShared_free(trie->bitmap);
14079                         if (trie->jump)
14080                             PerlMemShared_free(trie->jump);
14081                         PerlMemShared_free(trie->wordinfo);
14082                         /* do this last!!!! */
14083                         PerlMemShared_free(ri->data->data[n]);
14084                     }
14085                 }
14086                 break;
14087             default:
14088                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14089             }
14090         }
14091         Safefree(ri->data->what);
14092         Safefree(ri->data);
14093     }
14094
14095     Safefree(ri);
14096 }
14097
14098 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14099 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14100 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14101
14102 /*
14103    re_dup - duplicate a regexp.
14104
14105    This routine is expected to clone a given regexp structure. It is only
14106    compiled under USE_ITHREADS.
14107
14108    After all of the core data stored in struct regexp is duplicated
14109    the regexp_engine.dupe method is used to copy any private data
14110    stored in the *pprivate pointer. This allows extensions to handle
14111    any duplication it needs to do.
14112
14113    See pregfree() and regfree_internal() if you change anything here.
14114 */
14115 #if defined(USE_ITHREADS)
14116 #ifndef PERL_IN_XSUB_RE
14117 void
14118 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14119 {
14120     dVAR;
14121     I32 npar;
14122     const struct regexp *r = (const struct regexp *)SvANY(sstr);
14123     struct regexp *ret = (struct regexp *)SvANY(dstr);
14124
14125     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14126
14127     npar = r->nparens+1;
14128     Newx(ret->offs, npar, regexp_paren_pair);
14129     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14130     if(ret->swap) {
14131         /* no need to copy these */
14132         Newx(ret->swap, npar, regexp_paren_pair);
14133     }
14134
14135     if (ret->substrs) {
14136         /* Do it this way to avoid reading from *r after the StructCopy().
14137            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14138            cache, it doesn't matter.  */
14139         const bool anchored = r->check_substr
14140             ? r->check_substr == r->anchored_substr
14141             : r->check_utf8 == r->anchored_utf8;
14142         Newx(ret->substrs, 1, struct reg_substr_data);
14143         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14144
14145         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14146         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14147         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14148         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14149
14150         /* check_substr and check_utf8, if non-NULL, point to either their
14151            anchored or float namesakes, and don't hold a second reference.  */
14152
14153         if (ret->check_substr) {
14154             if (anchored) {
14155                 assert(r->check_utf8 == r->anchored_utf8);
14156                 ret->check_substr = ret->anchored_substr;
14157                 ret->check_utf8 = ret->anchored_utf8;
14158             } else {
14159                 assert(r->check_substr == r->float_substr);
14160                 assert(r->check_utf8 == r->float_utf8);
14161                 ret->check_substr = ret->float_substr;
14162                 ret->check_utf8 = ret->float_utf8;
14163             }
14164         } else if (ret->check_utf8) {
14165             if (anchored) {
14166                 ret->check_utf8 = ret->anchored_utf8;
14167             } else {
14168                 ret->check_utf8 = ret->float_utf8;
14169             }
14170         }
14171     }
14172
14173     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14174     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14175
14176     if (ret->pprivate)
14177         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14178
14179     if (RX_MATCH_COPIED(dstr))
14180         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14181     else
14182         ret->subbeg = NULL;
14183 #ifdef PERL_OLD_COPY_ON_WRITE
14184     ret->saved_copy = NULL;
14185 #endif
14186
14187     if (ret->mother_re) {
14188         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14189             /* Our storage points directly to our mother regexp, but that's
14190                1: a buffer in a different thread
14191                2: something we no longer hold a reference on
14192                so we need to copy it locally.  */
14193             /* Note we need to use SvCUR(), rather than
14194                SvLEN(), on our mother_re, because it, in
14195                turn, may well be pointing to its own mother_re.  */
14196             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14197                                    SvCUR(ret->mother_re)+1));
14198             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14199         }
14200         ret->mother_re      = NULL;
14201     }
14202     ret->gofs = 0;
14203 }
14204 #endif /* PERL_IN_XSUB_RE */
14205
14206 /*
14207    regdupe_internal()
14208
14209    This is the internal complement to regdupe() which is used to copy
14210    the structure pointed to by the *pprivate pointer in the regexp.
14211    This is the core version of the extension overridable cloning hook.
14212    The regexp structure being duplicated will be copied by perl prior
14213    to this and will be provided as the regexp *r argument, however
14214    with the /old/ structures pprivate pointer value. Thus this routine
14215    may override any copying normally done by perl.
14216
14217    It returns a pointer to the new regexp_internal structure.
14218 */
14219
14220 void *
14221 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14222 {
14223     dVAR;
14224     struct regexp *const r = (struct regexp *)SvANY(rx);
14225     regexp_internal *reti;
14226     int len;
14227     RXi_GET_DECL(r,ri);
14228
14229     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14230
14231     len = ProgLen(ri);
14232
14233     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14234     Copy(ri->program, reti->program, len+1, regnode);
14235
14236     reti->num_code_blocks = ri->num_code_blocks;
14237     if (ri->code_blocks) {
14238         int n;
14239         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14240                 struct reg_code_block);
14241         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14242                 struct reg_code_block);
14243         for (n = 0; n < ri->num_code_blocks; n++)
14244              reti->code_blocks[n].src_regex = (REGEXP*)
14245                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14246     }
14247     else
14248         reti->code_blocks = NULL;
14249
14250     reti->regstclass = NULL;
14251
14252     if (ri->data) {
14253         struct reg_data *d;
14254         const int count = ri->data->count;
14255         int i;
14256
14257         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14258                 char, struct reg_data);
14259         Newx(d->what, count, U8);
14260
14261         d->count = count;
14262         for (i = 0; i < count; i++) {
14263             d->what[i] = ri->data->what[i];
14264             switch (d->what[i]) {
14265                 /* see also regcomp.h and regfree_internal() */
14266             case 'a': /* actually an AV, but the dup function is identical.  */
14267             case 'r':
14268             case 's':
14269             case 'S':
14270             case 'u': /* actually an HV, but the dup function is identical.  */
14271                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14272                 break;
14273             case 'f':
14274                 /* This is cheating. */
14275                 Newx(d->data[i], 1, struct regnode_charclass_class);
14276                 StructCopy(ri->data->data[i], d->data[i],
14277                             struct regnode_charclass_class);
14278                 reti->regstclass = (regnode*)d->data[i];
14279                 break;
14280             case 'T':
14281                 /* Trie stclasses are readonly and can thus be shared
14282                  * without duplication. We free the stclass in pregfree
14283                  * when the corresponding reg_ac_data struct is freed.
14284                  */
14285                 reti->regstclass= ri->regstclass;
14286                 /* Fall through */
14287             case 't':
14288                 OP_REFCNT_LOCK;
14289                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14290                 OP_REFCNT_UNLOCK;
14291                 /* Fall through */
14292             case 'l':
14293             case 'L':
14294                 d->data[i] = ri->data->data[i];
14295                 break;
14296             default:
14297                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14298             }
14299         }
14300
14301         reti->data = d;
14302     }
14303     else
14304         reti->data = NULL;
14305
14306     reti->name_list_idx = ri->name_list_idx;
14307
14308 #ifdef RE_TRACK_PATTERN_OFFSETS
14309     if (ri->u.offsets) {
14310         Newx(reti->u.offsets, 2*len+1, U32);
14311         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14312     }
14313 #else
14314     SetProgLen(reti,len);
14315 #endif
14316
14317     return (void*)reti;
14318 }
14319
14320 #endif    /* USE_ITHREADS */
14321
14322 #ifndef PERL_IN_XSUB_RE
14323
14324 /*
14325  - regnext - dig the "next" pointer out of a node
14326  */
14327 regnode *
14328 Perl_regnext(pTHX_ register regnode *p)
14329 {
14330     dVAR;
14331     I32 offset;
14332
14333     if (!p)
14334         return(NULL);
14335
14336     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14337         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14338     }
14339
14340     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14341     if (offset == 0)
14342         return(NULL);
14343
14344     return(p+offset);
14345 }
14346 #endif
14347
14348 STATIC void
14349 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14350 {
14351     va_list args;
14352     STRLEN l1 = strlen(pat1);
14353     STRLEN l2 = strlen(pat2);
14354     char buf[512];
14355     SV *msv;
14356     const char *message;
14357
14358     PERL_ARGS_ASSERT_RE_CROAK2;
14359
14360     if (l1 > 510)
14361         l1 = 510;
14362     if (l1 + l2 > 510)
14363         l2 = 510 - l1;
14364     Copy(pat1, buf, l1 , char);
14365     Copy(pat2, buf + l1, l2 , char);
14366     buf[l1 + l2] = '\n';
14367     buf[l1 + l2 + 1] = '\0';
14368 #ifdef I_STDARG
14369     /* ANSI variant takes additional second argument */
14370     va_start(args, pat2);
14371 #else
14372     va_start(args);
14373 #endif
14374     msv = vmess(buf, &args);
14375     va_end(args);
14376     message = SvPV_const(msv,l1);
14377     if (l1 > 512)
14378         l1 = 512;
14379     Copy(message, buf, l1 , char);
14380     buf[l1-1] = '\0';                   /* Overwrite \n */
14381     Perl_croak(aTHX_ "%s", buf);
14382 }
14383
14384 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14385
14386 #ifndef PERL_IN_XSUB_RE
14387 void
14388 Perl_save_re_context(pTHX)
14389 {
14390     dVAR;
14391
14392     struct re_save_state *state;
14393
14394     SAVEVPTR(PL_curcop);
14395     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14396
14397     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14398     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14399     SSPUSHUV(SAVEt_RE_STATE);
14400
14401     Copy(&PL_reg_state, state, 1, struct re_save_state);
14402
14403     PL_reg_oldsaved = NULL;
14404     PL_reg_oldsavedlen = 0;
14405     PL_reg_maxiter = 0;
14406     PL_reg_leftiter = 0;
14407     PL_reg_poscache = NULL;
14408     PL_reg_poscache_size = 0;
14409 #ifdef PERL_OLD_COPY_ON_WRITE
14410     PL_nrs = NULL;
14411 #endif
14412
14413     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14414     if (PL_curpm) {
14415         const REGEXP * const rx = PM_GETRE(PL_curpm);
14416         if (rx) {
14417             U32 i;
14418             for (i = 1; i <= RX_NPARENS(rx); i++) {
14419                 char digits[TYPE_CHARS(long)];
14420                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14421                 GV *const *const gvp
14422                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14423
14424                 if (gvp) {
14425                     GV * const gv = *gvp;
14426                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14427                         save_scalar(gv);
14428                 }
14429             }
14430         }
14431     }
14432 }
14433 #endif
14434
14435 static void
14436 clear_re(pTHX_ void *r)
14437 {
14438     dVAR;
14439     ReREFCNT_dec((REGEXP *)r);
14440 }
14441
14442 #ifdef DEBUGGING
14443
14444 STATIC void
14445 S_put_byte(pTHX_ SV *sv, int c)
14446 {
14447     PERL_ARGS_ASSERT_PUT_BYTE;
14448
14449     /* Our definition of isPRINT() ignores locales, so only bytes that are
14450        not part of UTF-8 are considered printable. I assume that the same
14451        holds for UTF-EBCDIC.
14452        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14453        which Wikipedia says:
14454
14455        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14456        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14457        identical, to the ASCII delete (DEL) or rubout control character.
14458        ) So the old condition can be simplified to !isPRINT(c)  */
14459     if (!isPRINT(c)) {
14460         if (c < 256) {
14461             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14462         }
14463         else {
14464             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14465         }
14466     }
14467     else {
14468         const char string = c;
14469         if (c == '-' || c == ']' || c == '\\' || c == '^')
14470             sv_catpvs(sv, "\\");
14471         sv_catpvn(sv, &string, 1);
14472     }
14473 }
14474
14475
14476 #define CLEAR_OPTSTART \
14477     if (optstart) STMT_START { \
14478             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14479             optstart=NULL; \
14480     } STMT_END
14481
14482 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14483
14484 STATIC const regnode *
14485 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14486             const regnode *last, const regnode *plast,
14487             SV* sv, I32 indent, U32 depth)
14488 {
14489     dVAR;
14490     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14491     const regnode *next;
14492     const regnode *optstart= NULL;
14493
14494     RXi_GET_DECL(r,ri);
14495     GET_RE_DEBUG_FLAGS_DECL;
14496
14497     PERL_ARGS_ASSERT_DUMPUNTIL;
14498
14499 #ifdef DEBUG_DUMPUNTIL
14500     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14501         last ? last-start : 0,plast ? plast-start : 0);
14502 #endif
14503
14504     if (plast && plast < last)
14505         last= plast;
14506
14507     while (PL_regkind[op] != END && (!last || node < last)) {
14508         /* While that wasn't END last time... */
14509         NODE_ALIGN(node);
14510         op = OP(node);
14511         if (op == CLOSE || op == WHILEM)
14512             indent--;
14513         next = regnext((regnode *)node);
14514
14515         /* Where, what. */
14516         if (OP(node) == OPTIMIZED) {
14517             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14518                 optstart = node;
14519             else
14520                 goto after_print;
14521         } else
14522             CLEAR_OPTSTART;
14523
14524         regprop(r, sv, node);
14525         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14526                       (int)(2*indent + 1), "", SvPVX_const(sv));
14527
14528         if (OP(node) != OPTIMIZED) {
14529             if (next == NULL)           /* Next ptr. */
14530                 PerlIO_printf(Perl_debug_log, " (0)");
14531             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14532                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14533             else
14534                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14535             (void)PerlIO_putc(Perl_debug_log, '\n');
14536         }
14537
14538       after_print:
14539         if (PL_regkind[(U8)op] == BRANCHJ) {
14540             assert(next);
14541             {
14542                 const regnode *nnode = (OP(next) == LONGJMP
14543                                        ? regnext((regnode *)next)
14544                                        : next);
14545                 if (last && nnode > last)
14546                     nnode = last;
14547                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14548             }
14549         }
14550         else if (PL_regkind[(U8)op] == BRANCH) {
14551             assert(next);
14552             DUMPUNTIL(NEXTOPER(node), next);
14553         }
14554         else if ( PL_regkind[(U8)op]  == TRIE ) {
14555             const regnode *this_trie = node;
14556             const char op = OP(node);
14557             const U32 n = ARG(node);
14558             const reg_ac_data * const ac = op>=AHOCORASICK ?
14559                (reg_ac_data *)ri->data->data[n] :
14560                NULL;
14561             const reg_trie_data * const trie =
14562                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14563 #ifdef DEBUGGING
14564             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14565 #endif
14566             const regnode *nextbranch= NULL;
14567             I32 word_idx;
14568             sv_setpvs(sv, "");
14569             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14570                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14571
14572                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14573                    (int)(2*(indent+3)), "",
14574                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14575                             PL_colors[0], PL_colors[1],
14576                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14577                             PERL_PV_PRETTY_ELLIPSES    |
14578                             PERL_PV_PRETTY_LTGT
14579                             )
14580                             : "???"
14581                 );
14582                 if (trie->jump) {
14583                     U16 dist= trie->jump[word_idx+1];
14584                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14585                                   (UV)((dist ? this_trie + dist : next) - start));
14586                     if (dist) {
14587                         if (!nextbranch)
14588                             nextbranch= this_trie + trie->jump[0];
14589                         DUMPUNTIL(this_trie + dist, nextbranch);
14590                     }
14591                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14592                         nextbranch= regnext((regnode *)nextbranch);
14593                 } else {
14594                     PerlIO_printf(Perl_debug_log, "\n");
14595                 }
14596             }
14597             if (last && next > last)
14598                 node= last;
14599             else
14600                 node= next;
14601         }
14602         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14603             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14604                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14605         }
14606         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14607             assert(next);
14608             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14609         }
14610         else if ( op == PLUS || op == STAR) {
14611             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14612         }
14613         else if (PL_regkind[(U8)op] == ANYOF) {
14614             /* arglen 1 + class block */
14615             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14616                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14617             node = NEXTOPER(node);
14618         }
14619         else if (PL_regkind[(U8)op] == EXACT) {
14620             /* Literal string, where present. */
14621             node += NODE_SZ_STR(node) - 1;
14622             node = NEXTOPER(node);
14623         }
14624         else {
14625             node = NEXTOPER(node);
14626             node += regarglen[(U8)op];
14627         }
14628         if (op == CURLYX || op == OPEN)
14629             indent++;
14630     }
14631     CLEAR_OPTSTART;
14632 #ifdef DEBUG_DUMPUNTIL
14633     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14634 #endif
14635     return node;
14636 }
14637
14638 #endif  /* DEBUGGING */
14639
14640 /*
14641  * Local variables:
14642  * c-indentation-style: bsd
14643  * c-basic-offset: 4
14644  * indent-tabs-mode: nil
14645  * End:
14646  *
14647  * ex: set ts=8 sts=4 sw=4 et:
14648  */